howard.objects.variants

    1import csv
    2import gc
    3import gzip
    4import io
    5import multiprocessing
    6import os
    7import random
    8import re
    9import shlex
   10import sqlite3
   11import subprocess
   12from tempfile import NamedTemporaryFile, TemporaryDirectory
   13import tempfile
   14import duckdb
   15import json
   16import yaml
   17import argparse
   18import Bio.bgzf as bgzf
   19import pandas as pd
   20from pyfaidx import Fasta
   21import numpy as np
   22import vcf
   23import logging as log
   24import fastparquet as fp
   25from multiprocesspandas import applyparallel
   26
   27from howard.functions.commons import *
   28from howard.objects.database import *
   29from howard.functions.databases import *
   30from howard.functions.utils import *
   31
   32
   33class Variants:
   34
   35    def __init__(
   36        self,
   37        conn=None,
   38        input: str = None,
   39        output: str = None,
   40        config: dict = {},
   41        param: dict = {},
   42        load: bool = False,
   43    ) -> None:
   44        """
   45        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   46        header
   47
   48        :param conn: the connection to the database
   49        :param input: the input file
   50        :param output: the output file
   51        :param config: a dictionary containing the configuration of the model
   52        :param param: a dictionary containing the parameters of the model
   53        """
   54
   55        # Init variables
   56        self.init_variables()
   57
   58        # Input
   59        self.set_input(input)
   60
   61        # Config
   62        self.set_config(config)
   63
   64        # Param
   65        self.set_param(param)
   66
   67        # Output
   68        self.set_output(output)
   69
   70        # connexion
   71        self.set_connexion(conn)
   72
   73        # Header
   74        self.set_header()
   75
   76        # Samples
   77        self.set_samples()
   78
   79        # Load data
   80        if load:
   81            self.load_data()
   82
   83    def set_samples(self, samples: list = None) -> list:
   84        """
   85        The function `set_samples` sets the samples attribute of an object to a provided list or
   86        retrieves it from a parameter dictionary.
   87
   88        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   89        input and sets the `samples` attribute of the class to the provided list. If no samples are
   90        provided, it tries to get the samples from the class's parameters using the `get_param` method
   91        :type samples: list
   92        :return: The `samples` list is being returned.
   93        """
   94
   95        if not samples:
   96            samples = self.get_param().get("samples", {}).get("list", None)
   97
   98        self.samples = samples
   99
  100        return samples
  101
  102    def get_samples(self) -> list:
  103        """
  104        This function returns a list of samples.
  105        :return: The `get_samples` method is returning the `samples` attribute of the object.
  106        """
  107
  108        return self.samples
  109
    def get_samples_check(self) -> bool:
        """
        Return whether sample checking is enabled.

        Reads the "check" key of the "samples" section of the dictionary
        returned by `get_param()`.

        :return: the value of `param["samples"]["check"]`; defaults to
        `True` when the "samples" section or the "check" key is missing.
        """

        return self.get_param().get("samples", {}).get("check", True)
  120
  121    def set_input(self, input: str = None) -> None:
  122        """
  123        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  124        attributes in the class accordingly.
  125
  126        :param input: The `set_input` method in the provided code snippet is used to set attributes
  127        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  128        :type input: str
  129        """
  130
  131        if input and not isinstance(input, str):
  132            try:
  133                self.input = input.name
  134            except:
  135                log.error(f"Input file '{input} in bad format")
  136                raise ValueError(f"Input file '{input} in bad format")
  137        else:
  138            self.input = input
  139
  140        # Input format
  141        if input:
  142            input_name, input_extension = os.path.splitext(self.input)
  143            self.input_name = input_name
  144            self.input_extension = input_extension
  145            self.input_format = self.input_extension.replace(".", "")
  146
  147    def set_config(self, config: dict) -> None:
  148        """
  149        The set_config function takes a config object and assigns it as the configuration object for the
  150        class.
  151
  152        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  153        contains configuration settings for the class. When you call the `set_config` function with a
  154        dictionary object as the argument, it will set that dictionary as the configuration object for
  155        the class
  156        :type config: dict
  157        """
  158
  159        self.config = config
  160
  161    def set_param(self, param: dict) -> None:
  162        """
  163        This function sets a parameter object for the class based on the input dictionary.
  164
  165        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  166        as the `param` attribute of the class instance
  167        :type param: dict
  168        """
  169
  170        self.param = param
  171
  172    def init_variables(self) -> None:
  173        """
  174        This function initializes the variables that will be used in the rest of the class
  175        """
  176
  177        self.prefix = "howard"
  178        self.table_variants = "variants"
  179        self.dataframe = None
  180
  181        self.comparison_map = {
  182            "gt": ">",
  183            "gte": ">=",
  184            "lt": "<",
  185            "lte": "<=",
  186            "equals": "=",
  187            "contains": "SIMILAR TO",
  188        }
  189
  190        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  191
  192        self.code_type_map_to_sql = {
  193            "Integer": "INTEGER",
  194            "String": "VARCHAR",
  195            "Float": "FLOAT",
  196            "Flag": "VARCHAR",
  197        }
  198
  199        self.index_additionnal_fields = []
  200
  201    def get_indexing(self) -> bool:
  202        """
  203        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  204        returns False.
  205        :return: The value of the indexing parameter.
  206        """
  207
  208        return self.get_param().get("indexing", False)
  209
  210    def get_connexion_config(self) -> dict:
  211        """
  212        The function `get_connexion_config` returns a dictionary containing the configuration for a
  213        connection, including the number of threads and memory limit.
  214        :return: a dictionary containing the configuration for the Connexion library.
  215        """
  216
  217        # config
  218        config = self.get_config()
  219
  220        # Connexion config
  221        connexion_config = {}
  222        threads = self.get_threads()
  223
  224        # Threads
  225        if threads:
  226            connexion_config["threads"] = threads
  227
  228        # Memory
  229        # if config.get("memory", None):
  230        #     connexion_config["memory_limit"] = config.get("memory")
  231        if self.get_memory():
  232            connexion_config["memory_limit"] = self.get_memory()
  233
  234        # Temporary directory
  235        if config.get("tmp", None):
  236            connexion_config["temp_directory"] = config.get("tmp")
  237
  238        # Access
  239        if config.get("access", None):
  240            access = config.get("access")
  241            if access in ["RO"]:
  242                access = "READ_ONLY"
  243            elif access in ["RW"]:
  244                access = "READ_WRITE"
  245            connexion_db = self.get_connexion_db()
  246            if connexion_db in ":memory:":
  247                access = "READ_WRITE"
  248            connexion_config["access_mode"] = access
  249
  250        return connexion_config
  251
  252    def get_duckdb_settings(self) -> dict:
  253        """
  254        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  255        string.
  256        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  257        """
  258
  259        # config
  260        config = self.get_config()
  261
  262        # duckdb settings
  263        duckdb_settings_dict = {}
  264        if config.get("duckdb_settings", None):
  265            duckdb_settings = config.get("duckdb_settings")
  266            duckdb_settings = full_path(duckdb_settings)
  267            # duckdb setting is a file
  268            if os.path.exists(duckdb_settings):
  269                with open(duckdb_settings) as json_file:
  270                    duckdb_settings_dict = yaml.safe_load(json_file)
  271            # duckdb settings is a string
  272            else:
  273                duckdb_settings_dict = json.loads(duckdb_settings)
  274
  275        return duckdb_settings_dict
  276
  277    def set_connexion_db(self) -> str:
  278        """
  279        The function `set_connexion_db` returns the appropriate database connection string based on the
  280        input format and connection type.
  281        :return: the value of the variable `connexion_db`.
  282        """
  283
  284        # Default connexion db
  285        default_connexion_db = ":memory:"
  286
  287        # Find connexion db
  288        if self.get_input_format() in ["db", "duckdb"]:
  289            connexion_db = self.get_input()
  290        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  291            connexion_db = default_connexion_db
  292        elif self.get_connexion_type() in ["tmpfile"]:
  293            tmp_name = tempfile.mkdtemp(
  294                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  295            )
  296            connexion_db = f"{tmp_name}/tmp.db"
  297        elif self.get_connexion_type() != "":
  298            connexion_db = self.get_connexion_type()
  299        else:
  300            connexion_db = default_connexion_db
  301
  302        # Set connexion db
  303        self.connexion_db = connexion_db
  304
  305        return connexion_db
  306
  307    def set_connexion(self, conn) -> None:
  308        """
  309        The function `set_connexion` creates a connection to a database, with options for different
  310        database formats and settings.
  311
  312        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  313        database. If a connection is not provided, a new connection to an in-memory database is created.
  314        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  315        sqlite
  316        """
  317
  318        # Connexion db
  319        connexion_db = self.set_connexion_db()
  320
  321        # Connexion config
  322        connexion_config = self.get_connexion_config()
  323
  324        # Connexion format
  325        connexion_format = self.get_config().get("connexion_format", "duckdb")
  326        # Set connexion format
  327        self.connexion_format = connexion_format
  328
  329        # Connexion
  330        if not conn:
  331            if connexion_format in ["duckdb"]:
  332                conn = duckdb.connect(connexion_db, config=connexion_config)
  333                # duckDB settings
  334                duckdb_settings = self.get_duckdb_settings()
  335                if duckdb_settings:
  336                    for setting in duckdb_settings:
  337                        setting_value = duckdb_settings.get(setting)
  338                        if isinstance(setting_value, str):
  339                            setting_value = f"'{setting_value}'"
  340                        conn.execute(f"PRAGMA {setting}={setting_value};")
  341            elif connexion_format in ["sqlite"]:
  342                conn = sqlite3.connect(connexion_db)
  343
  344        # Set connexion
  345        self.conn = conn
  346
  347        # Log
  348        log.debug(f"connexion_format: {connexion_format}")
  349        log.debug(f"connexion_db: {connexion_db}")
  350        log.debug(f"connexion config: {connexion_config}")
  351        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  352
  353    def set_output(self, output: str = None) -> None:
  354        """
  355        The `set_output` function in Python sets the output file based on the input or a specified key
  356        in the config file, extracting the output name, extension, and format.
  357
  358        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  359        the output file. If the config file has an 'output' key, the method sets the output to the value
  360        of that key. If no output is provided, it sets the output to `None`
  361        :type output: str
  362        """
  363
  364        if output and not isinstance(output, str):
  365            self.output = output.name
  366        else:
  367            self.output = output
  368
  369        # Output format
  370        if self.output:
  371            output_name, output_extension = os.path.splitext(self.output)
  372            self.output_name = output_name
  373            self.output_extension = output_extension
  374            self.output_format = self.output_extension.replace(".", "")
  375        else:
  376            self.output_name = None
  377            self.output_extension = None
  378            self.output_format = None
  379
    def set_header(self) -> None:
        """
        Read the header of the input file and store it both as a list of
        strings (`self.header_list`) and as a VCF reader object
        (`self.header_vcf`).

        The header is searched in order: in a `header_file` given in the
        config, within the input file itself (vcf/hdr formats), in an
        external `<input>.hdr` file, and finally inferred from the file
        columns via a Database object; when nothing is found, a minimal
        default VCF header is used. When there is no input file, both
        attributes are set to None.

        :raises ValueError: when the input file format is not supported
        """

        input_file = self.get_input()
        default_header_list = [
            "##fileformat=VCFv4.2",
            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
        ]

        # Full path
        input_file = full_path(input_file)

        if input_file:

            input_format = self.get_input_format()
            input_compressed = self.get_input_compressed()
            config = self.get_config()
            header_list = default_header_list
            if input_format in [
                "vcf",
                "hdr",
                "tsv",
                "csv",
                "psv",
                "parquet",
                "db",
                "duckdb",
            ]:
                # header provided in param
                if config.get("header_file", None):
                    with open(config.get("header_file"), "rt") as f:
                        header_list = self.read_vcf_header(f)
                # within a vcf file format (header within input file itself)
                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
                    # within a compressed vcf file format (.vcf.gz)
                    if input_compressed:
                        with bgzf.open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                    # within an uncompressed vcf file format (.vcf)
                    else:
                        with open(input_file, "rt") as f:
                            header_list = self.read_vcf_header(f)
                # header provided in default external file .hdr
                elif os.path.exists((input_file + ".hdr")):
                    with open(input_file + ".hdr", "rt") as f:
                        header_list = self.read_vcf_header(f)
                else:
                    try:  # Try to get header info fields and file columns

                        with tempfile.TemporaryDirectory() as tmpdir:

                            # Create database
                            db_for_header = Database(database=input_file)

                            # Get header columns for infos fields
                            db_header_from_columns = (
                                db_for_header.get_header_from_columns()
                            )

                            # Get real columns in the file
                            db_header_columns = db_for_header.get_columns()

                            # Write header file
                            header_file_tmp = os.path.join(tmpdir, "header")
                            f = open(header_file_tmp, "w")
                            vcf.Writer(f, db_header_from_columns)
                            f.close()

                            # Replace #CHROM line with real columns
                            header_list = db_for_header.read_header_file(
                                header_file=header_file_tmp
                            )
                            header_list[-1] = "\t".join(db_header_columns)

                    # NOTE(review): bare except deliberately falls back to
                    # the default header on any failure — consider
                    # narrowing the exception type.
                    except:

                        log.warning(
                            f"No header for file {input_file}. Set as default VCF header"
                        )
                        header_list = default_header_list

            else:  # try for unknown format ?

                log.error(f"Input file format '{input_format}' not available")
                raise ValueError(f"Input file format '{input_format}' not available")

            # Guard against an empty header
            if not header_list:
                header_list = default_header_list

            # header as list
            self.header_list = header_list

            # header as VCF object
            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))

        else:

            self.header_list = None
            self.header_vcf = None
  482    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  483        """
  484        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  485        DataFrame based on the connection format.
  486
  487        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  488        represents the SQL query you want to execute. This query will be used to fetch data from a
  489        database and convert it into a pandas DataFrame
  490        :type query: str
  491        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  492        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  493        function will only fetch up to that number of rows from the database query result. If no limit
  494        is specified,
  495        :type limit: int
  496        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  497        """
  498
  499        # Connexion format
  500        connexion_format = self.get_connexion_format()
  501
  502        # Limit in query
  503        if limit:
  504            pd.set_option("display.max_rows", limit)
  505            if connexion_format in ["duckdb"]:
  506                df = (
  507                    self.conn.execute(query)
  508                    .fetch_record_batch(limit)
  509                    .read_next_batch()
  510                    .to_pandas()
  511                )
  512            elif connexion_format in ["sqlite"]:
  513                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  514
  515        # Full query
  516        else:
  517            if connexion_format in ["duckdb"]:
  518                df = self.conn.execute(query).df()
  519            elif connexion_format in ["sqlite"]:
  520                df = pd.read_sql_query(query, self.conn)
  521
  522        return df
  523
  524    def get_overview(self) -> None:
  525        """
  526        The function prints the input, output, config, and dataframe of the current object
  527        """
  528        table_variants_from = self.get_table_variants(clause="from")
  529        sql_columns = self.get_header_columns_as_sql()
  530        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  531        df = self.get_query_to_df(sql_query_export)
  532        log.info(
  533            "Input:  "
  534            + str(self.get_input())
  535            + " ["
  536            + str(str(self.get_input_format()))
  537            + "]"
  538        )
  539        log.info(
  540            "Output: "
  541            + str(self.get_output())
  542            + " ["
  543            + str(str(self.get_output_format()))
  544            + "]"
  545        )
  546        log.info("Config: ")
  547        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  548            "\n"
  549        ):
  550            log.info("\t" + str(d))
  551        log.info("Param: ")
  552        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  553            "\n"
  554        ):
  555            log.info("\t" + str(d))
  556        log.info("Sample list: " + str(self.get_header_sample_list()))
  557        log.info("Dataframe: ")
  558        for d in str(df).split("\n"):
  559            log.info("\t" + str(d))
  560
  561        # garbage collector
  562        del df
  563        gc.collect()
  564
  565        return None
  566
    def get_stats(self) -> dict:
        """
        Compute statistics on the loaded variants.

        The returned dictionary contains:
        - "Infos": input file, number of variants, number of samples and
          number of INFO/FORMAT fields
        - "Variants": counts by chromosome, by type (Total/SNV/MNV/InDel)
          and by substitution
        - "Samples": genotype counts per sample (when GT/FORMAT available)
        - "Header": description of INFO and FORMAT fields
        - "Quality": QUAL statistics (when the QUAL column is available)

        :return: a dictionary containing various statistics of the current
        object
        """

        # Log
        log.info(f"Stats Calculation...")

        # table variants
        table_variants_from = self.get_table_variants()

        # stats dict
        stats = {"Infos": {}}

        ### File
        input_file = self.get_input()
        stats["Infos"]["Input file"] = input_file

        # Header
        header_infos = self.get_header().infos
        header_formats = self.get_header().formats
        header_infos_list = list(header_infos)
        header_formats_list = list(header_formats)

        ### Variants

        stats["Variants"] = {}

        # Variants by chr
        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
            by=["CHROM"], kind="quicksort"
        )

        # Total number of variants
        nb_of_variants = nb_of_variants_by_chrom["count"].sum()

        # Calculate percentage
        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
            lambda x: (x / nb_of_variants)
        )

        stats["Variants"]["Number of variants by chromosome"] = (
            nb_of_variants_by_chrom.to_dict(orient="index")
        )

        stats["Infos"]["Number of variants"] = int(nb_of_variants)

        ### Samples

        # Init
        samples = {}
        nb_of_samples = 0

        # Check Samples: count genotypes per sample when a GT FORMAT field
        # and a FORMAT column are present
        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
            log.debug(f"Check samples...")
            for sample in self.get_header_sample_list():
                sql_query_samples = f"""
                    SELECT  '{sample}' as sample,
                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
                    FROM {table_variants_from}
                    WHERE (
                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
                        AND
                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
                      )
                    GROUP BY genotype
                    """
                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
                sample_genotype_count = sql_query_genotype_df["count"].sum()
                if len(sql_query_genotype_df):
                    nb_of_samples += 1
                    samples[f"{sample} - {sample_genotype_count} variants"] = (
                        sql_query_genotype_df.to_dict(orient="index")
                    )

            stats["Samples"] = samples
            stats["Infos"]["Number of samples"] = nb_of_samples

        # #
        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
        #     stats["Infos"]["Number of samples"] = nb_of_samples
        # elif nb_of_samples:
        #     stats["Infos"]["Number of samples"] = "not a VCF format"

        ### INFO and FORMAT fields
        header_types_df = {}
        header_types_list = {
            "List of INFO fields": header_infos,
            "List of FORMAT fields": header_formats,
        }
        i = 0
        for header_type in header_types_list:

            header_type_infos = header_types_list.get(header_type)
            header_infos_dict = {}

            for info in header_type_infos:

                i += 1
                header_infos_dict[i] = {}

                # ID
                header_infos_dict[i]["id"] = info

                # num: special VCF Number codes (None='.', A, G, R)
                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
                if header_type_infos[info].num in genotype_map.keys():
                    header_infos_dict[i]["Number"] = genotype_map.get(
                        header_type_infos[info].num
                    )
                else:
                    header_infos_dict[i]["Number"] = header_type_infos[info].num

                # type
                if header_type_infos[info].type:
                    header_infos_dict[i]["Type"] = header_type_infos[info].type
                else:
                    header_infos_dict[i]["Type"] = "."

                # desc
                if header_type_infos[info].desc != None:
                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
                else:
                    header_infos_dict[i]["Description"] = ""

            if len(header_infos_dict):
                header_types_df[header_type] = pd.DataFrame.from_dict(
                    header_infos_dict, orient="index"
                ).to_dict(orient="index")

        # Stats
        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
        stats["Header"] = header_types_df

        ### QUAL
        if "QUAL" in self.get_header_columns():
            sql_query_qual = f"""
                    SELECT
                        avg(CAST(QUAL AS INTEGER)) AS Average,
                        min(CAST(QUAL AS INTEGER)) AS Minimum,
                        max(CAST(QUAL AS INTEGER)) AS Maximum,
                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
                        median(CAST(QUAL AS INTEGER)) AS Median,
                        variance(CAST(QUAL AS INTEGER)) AS Variance
                    FROM {table_variants_from}
                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
                    """

            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
            stats["Quality"] = {"Stats": qual}

        ### SNV and InDel
        # NOTE(review): in the InDel clause below, AND binds tighter than
        # OR, so the filter is len(REF) > 1 OR (len(ALT) > 1 AND
        # len(REF) != len(ALT)) — confirm this is the intended
        # classification.

        sql_query_snv = f"""
            
            SELECT Type, count FROM (

                    SELECT
                        'Total' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}

                    UNION

                    SELECT
                        'MNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 AND len(ALT) > 1
                    AND len(REF) = len(ALT)

                    UNION

                    SELECT
                        'InDel' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) > 1 OR len(ALT) > 1
                    AND len(REF) != len(ALT)
                    
                    UNION

                    SELECT
                        'SNV' AS Type,
                        count(*) AS count
                    FROM {table_variants_from}
                    WHERE len(REF) = 1 AND len(ALT) = 1

                )

            ORDER BY count DESC

                """
        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")

        sql_query_snv_substitution = f"""
                SELECT
                    concat(REF, '>', ALT) AS 'Substitution',
                    count(*) AS count
                FROM {table_variants_from}
                WHERE len(REF) = 1 AND len(ALT) = 1
                GROUP BY REF, ALT
                ORDER BY count(*) DESC
                """
        snv_substitution = (
            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
        )
        stats["Variants"]["Counts"] = snv_indel
        stats["Variants"]["Substitutions"] = snv_substitution

        return stats
  788
  789    def stats_to_file(self, file: str = None) -> str:
  790        """
  791        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  792        into a JSON object, and writes the JSON object to the specified file.
  793
  794        :param file: The `file` parameter is a string that represents the file path where the JSON data
  795        will be written
  796        :type file: str
  797        :return: the name of the file that was written to.
  798        """
  799
  800        # Get stats
  801        stats = self.get_stats()
  802
  803        # Serializing json
  804        json_object = json.dumps(stats, indent=4)
  805
  806        # Writing to sample.json
  807        with open(file, "w") as outfile:
  808            outfile.write(json_object)
  809
  810        return file
  811
  812    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  813        """
  814        The `print_stats` function generates a markdown file and prints the statistics contained in a
  815        JSON file in a formatted manner.
  816
  817        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  818        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  819        provided, a temporary directory will be created and the stats will be saved in a file named
  820        "stats.md" within that
  821        :type output_file: str
  822        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  823        file where the statistics will be saved. If no value is provided, a temporary directory will be
  824        created and a default file name "stats.json" will be used
  825        :type json_file: str
  826        :return: The function `print_stats` does not return any value. It has a return type annotation
  827        of `None`.
  828        """
  829
  830        # Full path
  831        output_file = full_path(output_file)
  832        json_file = full_path(json_file)
  833
  834        with tempfile.TemporaryDirectory() as tmpdir:
  835
  836            # Files
  837            if not output_file:
  838                output_file = os.path.join(tmpdir, "stats.md")
  839            if not json_file:
  840                json_file = os.path.join(tmpdir, "stats.json")
  841
  842            # Create folders
  843            if not os.path.exists(os.path.dirname(output_file)):
  844                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  845            if not os.path.exists(os.path.dirname(json_file)):
  846                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  847
  848            # Create stats JSON file
  849            stats_file = self.stats_to_file(file=json_file)
  850
  851            # Print stats file
  852            with open(stats_file) as f:
  853                stats = yaml.safe_load(f)
  854
  855            # Output
  856            output_title = []
  857            output_index = []
  858            output = []
  859
  860            # Title
  861            output_title.append("# HOWARD Stats")
  862
  863            # Index
  864            output_index.append("## Index")
  865
  866            # Process sections
  867            for section in stats:
  868                infos = stats.get(section)
  869                section_link = "#" + section.lower().replace(" ", "-")
  870                output.append(f"## {section}")
  871                output_index.append(f"- [{section}]({section_link})")
  872
  873                if len(infos):
  874                    for info in infos:
  875                        try:
  876                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  877                            is_df = True
  878                        except:
  879                            try:
  880                                df = pd.DataFrame.from_dict(
  881                                    json.loads((infos.get(info))), orient="index"
  882                                )
  883                                is_df = True
  884                            except:
  885                                is_df = False
  886                        if is_df:
  887                            output.append(f"### {info}")
  888                            info_link = "#" + info.lower().replace(" ", "-")
  889                            output_index.append(f"   - [{info}]({info_link})")
  890                            output.append(f"{df.to_markdown(index=False)}")
  891                        else:
  892                            output.append(f"- {info}: {infos.get(info)}")
  893                else:
  894                    output.append(f"NA")
  895
  896            # Write stats in markdown file
  897            with open(output_file, "w") as fp:
  898                for item in output_title:
  899                    fp.write("%s\n" % item)
  900                for item in output_index:
  901                    fp.write("%s\n" % item)
  902                for item in output:
  903                    fp.write("%s\n" % item)
  904
  905            # Output stats in markdown
  906            print("")
  907            print("\n\n".join(output_title))
  908            print("")
  909            print("\n\n".join(output))
  910            print("")
  911
  912        return None
  913
  914    def get_input(self) -> str:
  915        """
  916        It returns the value of the input variable.
  917        :return: The input is being returned.
  918        """
  919        return self.input
  920
  921    def get_input_format(self, input_file: str = None) -> str:
  922        """
  923        This function returns the format of the input variable, either from the provided input file or
  924        by prompting for input.
  925
  926        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  927        represents the file path of the input file. If no `input_file` is provided when calling the
  928        method, it will default to `None`
  929        :type input_file: str
  930        :return: The format of the input variable is being returned.
  931        """
  932
  933        if not input_file:
  934            input_file = self.get_input()
  935        input_format = get_file_format(input_file)
  936        return input_format
  937
  938    def get_input_compressed(self, input_file: str = None) -> str:
  939        """
  940        The function `get_input_compressed` returns the format of the input variable after compressing
  941        it.
  942
  943        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  944        that represents the file path of the input file. If no `input_file` is provided when calling the
  945        method, it will default to `None` and the method will then call `self.get_input()` to
  946        :type input_file: str
  947        :return: The function `get_input_compressed` returns the compressed format of the input
  948        variable.
  949        """
  950
  951        if not input_file:
  952            input_file = self.get_input()
  953        input_compressed = get_file_compressed(input_file)
  954        return input_compressed
  955
  956    def get_output(self) -> str:
  957        """
  958        It returns the output of the neuron.
  959        :return: The output of the neural network.
  960        """
  961
  962        return self.output
  963
  964    def get_output_format(self, output_file: str = None) -> str:
  965        """
  966        The function `get_output_format` returns the format of the input variable or the output file if
  967        provided.
  968
  969        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  970        that represents the file path of the output file. If no `output_file` is provided when calling
  971        the method, it will default to the output obtained from the `get_output` method of the class
  972        instance. The
  973        :type output_file: str
  974        :return: The format of the input variable is being returned.
  975        """
  976
  977        if not output_file:
  978            output_file = self.get_output()
  979        output_format = get_file_format(output_file)
  980
  981        return output_format
  982
  983    def get_config(self) -> dict:
  984        """
  985        It returns the config
  986        :return: The config variable is being returned.
  987        """
  988        return self.config
  989
  990    def get_param(self) -> dict:
  991        """
  992        It returns the param
  993        :return: The param variable is being returned.
  994        """
  995        return self.param
  996
  997    def get_connexion_db(self) -> str:
  998        """
  999        It returns the connexion_db attribute of the object
 1000        :return: The connexion_db is being returned.
 1001        """
 1002        return self.connexion_db
 1003
 1004    def get_prefix(self) -> str:
 1005        """
 1006        It returns the prefix of the object.
 1007        :return: The prefix is being returned.
 1008        """
 1009        return self.prefix
 1010
 1011    def get_table_variants(self, clause: str = "select") -> str:
 1012        """
 1013        This function returns the table_variants attribute of the object
 1014
 1015        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1016        defaults to select (optional)
 1017        :return: The table_variants attribute of the object.
 1018        """
 1019
 1020        # Access
 1021        access = self.get_config().get("access", None)
 1022
 1023        # Clauses "select", "where", "update"
 1024        if clause in ["select", "where", "update"]:
 1025            table_variants = self.table_variants
 1026        # Clause "from"
 1027        elif clause in ["from"]:
 1028            # For Read Only
 1029            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1030                input_file = self.get_input()
 1031                table_variants = f"'{input_file}' as variants"
 1032            # For Read Write
 1033            else:
 1034                table_variants = f"{self.table_variants} as variants"
 1035        else:
 1036            table_variants = self.table_variants
 1037        return table_variants
 1038
 1039    def get_tmp_dir(self) -> str:
 1040        """
 1041        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1042        parameters or a default path.
 1043        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1044        configuration, parameters, and a default value of "/tmp".
 1045        """
 1046
 1047        return get_tmp(
 1048            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1049        )
 1050
 1051    def get_connexion_type(self) -> str:
 1052        """
 1053        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1054
 1055        :return: The connexion type is being returned.
 1056        """
 1057        return self.get_config().get("connexion_type", "memory")
 1058
 1059    def get_connexion(self):
 1060        """
 1061        It returns the connection object
 1062
 1063        :return: The connection object.
 1064        """
 1065        return self.conn
 1066
 1067    def close_connexion(self) -> None:
 1068        """
 1069        This function closes the connection to the database.
 1070        :return: The connection is being closed.
 1071        """
 1072        return self.conn.close()
 1073
 1074    def get_header(self, type: str = "vcf"):
 1075        """
 1076        This function returns the header of the VCF file as a list of strings
 1077
 1078        :param type: the type of header you want to get, defaults to vcf (optional)
 1079        :return: The header of the vcf file.
 1080        """
 1081
 1082        if self.header_vcf:
 1083            if type == "vcf":
 1084                return self.header_vcf
 1085            elif type == "list":
 1086                return self.header_list
 1087        else:
 1088            if type == "vcf":
 1089                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1090                return header
 1091            elif type == "list":
 1092                return vcf_required
 1093
 1094    def get_header_length(self, file: str = None) -> int:
 1095        """
 1096        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1097        line.
 1098
 1099        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1100        header file. If this argument is provided, the function will read the header from the specified
 1101        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1102        :type file: str
 1103        :return: the length of the header list, excluding the #CHROM line.
 1104        """
 1105
 1106        if file:
 1107            return len(self.read_vcf_header_file(file=file)) - 1
 1108        elif self.get_header(type="list"):
 1109            return len(self.get_header(type="list")) - 1
 1110        else:
 1111            return 0
 1112
 1113    def get_header_columns(self) -> str:
 1114        """
 1115        This function returns the header list of a VCF
 1116
 1117        :return: The length of the header list.
 1118        """
 1119        if self.get_header():
 1120            return self.get_header(type="list")[-1]
 1121        else:
 1122            return ""
 1123
 1124    def get_header_columns_as_list(self) -> list:
 1125        """
 1126        This function returns the header list of a VCF
 1127
 1128        :return: The length of the header list.
 1129        """
 1130        if self.get_header():
 1131            return self.get_header_columns().strip().split("\t")
 1132        else:
 1133            return []
 1134
 1135    def get_header_columns_as_sql(self) -> str:
 1136        """
 1137        This function retruns header length (without #CHROM line)
 1138
 1139        :return: The length of the header list.
 1140        """
 1141        sql_column_list = []
 1142        for col in self.get_header_columns_as_list():
 1143            sql_column_list.append(f'"{col}"')
 1144        return ",".join(sql_column_list)
 1145
 1146    def get_header_sample_list(
 1147        self, check: bool = False, samples: list = None, samples_force: bool = False
 1148    ) -> list:
 1149        """
 1150        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1151        checking and filtering based on input parameters.
 1152
 1153        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1154        parameter that determines whether to check if the samples in the list are properly defined as
 1155        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1156        list is defined as a, defaults to False
 1157        :type check: bool (optional)
 1158        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1159        allows you to specify a subset of samples from the header. If you provide a list of sample
 1160        names, the function will check if each sample is defined in the header. If a sample is not found
 1161        in the
 1162        :type samples: list
 1163        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1164        a boolean parameter that determines whether to force the function to return the sample list
 1165        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1166        function will return the sample list without performing, defaults to False
 1167        :type samples_force: bool (optional)
 1168        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1169        parameters and conditions specified in the function.
 1170        """
 1171
 1172        # Init
 1173        samples_list = []
 1174
 1175        if samples is None:
 1176            samples_list = self.header_vcf.samples
 1177        else:
 1178            samples_checked = []
 1179            for sample in samples:
 1180                if sample in self.header_vcf.samples:
 1181                    samples_checked.append(sample)
 1182                else:
 1183                    log.warning(f"Sample '{sample}' not defined in header")
 1184            samples_list = samples_checked
 1185
 1186            # Force sample list without checking if is_genotype_column
 1187            if samples_force:
 1188                log.warning(f"Samples {samples_list} not checked if genotypes")
 1189                return samples_list
 1190
 1191        if check:
 1192            samples_checked = []
 1193            for sample in samples_list:
 1194                if self.is_genotype_column(column=sample):
 1195                    samples_checked.append(sample)
 1196                else:
 1197                    log.warning(
 1198                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1199                    )
 1200            samples_list = samples_checked
 1201
 1202        # Return samples list
 1203        return samples_list
 1204
 1205    def is_genotype_column(self, column: str = None) -> bool:
 1206        """
 1207        This function checks if a given column is a genotype column in a database.
 1208
 1209        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1210        represents the column name in a database table. This method checks if the specified column is a
 1211        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1212        method of
 1213        :type column: str
 1214        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1215        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1216        column name and returns the result. If the `column` parameter is None, it returns False.
 1217        """
 1218
 1219        if column is not None:
 1220            return Database(database=self.get_input()).is_genotype_column(column=column)
 1221        else:
 1222            return False
 1223
 1224    def get_verbose(self) -> bool:
 1225        """
 1226        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1227        exist
 1228
 1229        :return: The value of the key "verbose" in the config dictionary.
 1230        """
 1231        return self.get_config().get("verbose", False)
 1232
 1233    def get_connexion_format(self) -> str:
 1234        """
 1235        It returns the connexion format of the object.
 1236        :return: The connexion_format is being returned.
 1237        """
 1238        connexion_format = self.connexion_format
 1239        if connexion_format not in ["duckdb", "sqlite"]:
 1240            log.error(f"Unknown connexion format {connexion_format}")
 1241            raise ValueError(f"Unknown connexion format {connexion_format}")
 1242        else:
 1243            return connexion_format
 1244
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table based on the specified
        database format.

        :param file: The `file` parameter is the file that you want to load into a table. It should be
        the path to the file on your system
        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
        should contain the names of the columns in the table where the data will be inserted. The column
        names should be separated by commas within the string. For example, if you have columns named
        "id", "name
        :type columns: str
        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
        the number of lines to skip at the beginning of the file before reading the actual data. This
        parameter allows you to skip any header information present in the file before processing the
        data, defaults to 0
        :type header_len: int (optional)
        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
        separator character that is used in the file being read. In this case, the default separator is
        set to `\t`, which represents a tab character. You can change this parameter to a different
        separator character if, defaults to \t
        :type sep: str (optional)
        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
        when processing the file in chunks. In the provided code snippet, the default value for
        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
        to 1000000
        :type chunksize: int (optional)
        """

        # Config: the "load.chunk" configuration entry overrides the
        # chunksize argument when present
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE: when chunksize is falsy (0/None from config), nothing is
        # read or inserted at all
        if chunksize:
            # Read the file chunk by chunk; header_len lines are skipped
            # before the data rows (e.g. VCF meta-information lines)
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # The SQL references the local DataFrame "chunk" by name:
                    # DuckDB resolves it via its replacement scan of the
                    # caller's variables, so the variable name must remain
                    # exactly "chunk" for this query to work
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # SQLite path: append the chunk through pandas' to_sql
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1298
 1299    def load_data(
 1300        self,
 1301        input_file: str = None,
 1302        drop_variants_table: bool = False,
 1303        sample_size: int = 20480,
 1304    ) -> None:
 1305        """
 1306        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1307        table before loading the data and specify a sample size.
 1308
 1309        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1310        table
 1311        :type input_file: str
 1312        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1313        determines whether the variants table should be dropped before loading the data. If set to
 1314        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1315        not be dropped, defaults to False
 1316        :type drop_variants_table: bool (optional)
 1317        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1318        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1319        20480
 1320        :type sample_size: int (optional)
 1321        """
 1322
 1323        log.info("Loading...")
 1324
 1325        # change input file
 1326        if input_file:
 1327            self.set_input(input_file)
 1328            self.set_header()
 1329
 1330        # drop variants table
 1331        if drop_variants_table:
 1332            self.drop_variants_table()
 1333
 1334        # get table variants
 1335        table_variants = self.get_table_variants()
 1336
 1337        # Access
 1338        access = self.get_config().get("access", None)
 1339        log.debug(f"access: {access}")
 1340
 1341        # Input format and compress
 1342        input_format = self.get_input_format()
 1343        input_compressed = self.get_input_compressed()
 1344        log.debug(f"input_format: {input_format}")
 1345        log.debug(f"input_compressed: {input_compressed}")
 1346
 1347        # input_compressed_format
 1348        if input_compressed:
 1349            input_compressed_format = "gzip"
 1350        else:
 1351            input_compressed_format = "none"
 1352        log.debug(f"input_compressed_format: {input_compressed_format}")
 1353
 1354        # Connexion format
 1355        connexion_format = self.get_connexion_format()
 1356
 1357        # Sample size
 1358        if not sample_size:
 1359            sample_size = -1
 1360        log.debug(f"sample_size: {sample_size}")
 1361
 1362        # Load data
 1363        log.debug(f"Load Data from {input_format}")
 1364
 1365        # DuckDB connexion
 1366        if connexion_format in ["duckdb"]:
 1367
 1368            # Database already exists
 1369            if self.input_format in ["db", "duckdb"]:
 1370
 1371                if connexion_format in ["duckdb"]:
 1372                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1373                else:
 1374                    log.error(
 1375                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1376                    )
 1377                    raise ValueError(
 1378                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1379                    )
 1380
 1381            # Load from existing database format
 1382            else:
 1383
 1384                try:
 1385                    # Create Table or View
 1386                    database = Database(database=self.input)
 1387                    sql_from = database.get_sql_from(sample_size=sample_size)
 1388
 1389                    if access in ["RO"]:
 1390                        sql_load = (
 1391                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1392                        )
 1393                    else:
 1394                        sql_load = (
 1395                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1396                        )
 1397                    self.conn.execute(sql_load)
 1398
 1399                except:
 1400                    # Format not available
 1401                    log.error(f"Input file format '{self.input_format}' not available")
 1402                    raise ValueError(
 1403                        f"Input file format '{self.input_format}' not available"
 1404                    )
 1405
 1406        # SQLite connexion
 1407        elif connexion_format in ["sqlite"] and input_format in [
 1408            "vcf",
 1409            "tsv",
 1410            "csv",
 1411            "psv",
 1412        ]:
 1413
 1414            # Main structure
 1415            structure = {
 1416                "#CHROM": "VARCHAR",
 1417                "POS": "INTEGER",
 1418                "ID": "VARCHAR",
 1419                "REF": "VARCHAR",
 1420                "ALT": "VARCHAR",
 1421                "QUAL": "VARCHAR",
 1422                "FILTER": "VARCHAR",
 1423                "INFO": "VARCHAR",
 1424            }
 1425
 1426            # Strcuture with samples
 1427            structure_complete = structure
 1428            if self.get_header_sample_list():
 1429                structure["FORMAT"] = "VARCHAR"
 1430                for sample in self.get_header_sample_list():
 1431                    structure_complete[sample] = "VARCHAR"
 1432
 1433            # Columns list for create and insert
 1434            sql_create_table_columns = []
 1435            sql_create_table_columns_list = []
 1436            for column in structure_complete:
 1437                column_type = structure_complete[column]
 1438                sql_create_table_columns.append(
 1439                    f'"{column}" {column_type} default NULL'
 1440                )
 1441                sql_create_table_columns_list.append(f'"{column}"')
 1442
 1443            # Create database
 1444            log.debug(f"Create Table {table_variants}")
 1445            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1446            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1447            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1448            self.conn.execute(sql_create_table)
 1449
 1450            # chunksize define length of file chunk load file
 1451            chunksize = 100000
 1452
 1453            # delimiter
 1454            delimiter = file_format_delimiters.get(input_format, "\t")
 1455
 1456            # Load the input file
 1457            with open(self.input, "rt") as input_file:
 1458
 1459                # Use the appropriate file handler based on the input format
 1460                if input_compressed:
 1461                    input_file = bgzf.open(self.input, "rt")
 1462                if input_format in ["vcf"]:
 1463                    header_len = self.get_header_length()
 1464                else:
 1465                    header_len = 0
 1466
 1467                # Insert the file contents into a table
 1468                self.insert_file_to_table(
 1469                    input_file,
 1470                    columns=sql_create_table_columns_list_sql,
 1471                    header_len=header_len,
 1472                    sep=delimiter,
 1473                    chunksize=chunksize,
 1474                )
 1475
 1476        else:
 1477            log.error(
 1478                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1479            )
 1480            raise ValueError(
 1481                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1482            )
 1483
 1484        # Explode INFOS fields into table fields
 1485        if self.get_explode_infos():
 1486            self.explode_infos(
 1487                prefix=self.get_explode_infos_prefix(),
 1488                fields=self.get_explode_infos_fields(),
 1489                force=True,
 1490            )
 1491
 1492        # Create index after insertion
 1493        self.create_indexes()
 1494
 1495    def get_explode_infos(self) -> bool:
 1496        """
 1497        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1498        to False if it is not set.
 1499        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1500        value. If the parameter is not present, it will return False.
 1501        """
 1502
 1503        return self.get_param().get("explode", {}).get("explode_infos", False)
 1504
 1505    def get_explode_infos_fields(
 1506        self,
 1507        explode_infos_fields: str = None,
 1508        remove_fields_not_in_header: bool = False,
 1509    ) -> list:
 1510        """
 1511        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1512        the input parameter `explode_infos_fields`.
 1513
 1514        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1515        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1516        comma-separated list of field names to explode
 1517        :type explode_infos_fields: str
 1518        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1519        flag that determines whether to remove fields that are not present in the header. If it is set
 1520        to `True`, any field that is not in the header will be excluded from the list of exploded
 1521        information fields. If it is set to `, defaults to False
 1522        :type remove_fields_not_in_header: bool (optional)
 1523        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1524        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1525        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1526        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1527        splitting the string by commas.
 1528        """
 1529
 1530        # If no fields, get it in param
 1531        if not explode_infos_fields:
 1532            explode_infos_fields = (
 1533                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1534            )
 1535
 1536        # If no fields, defined as all fields in header using keyword
 1537        if not explode_infos_fields:
 1538            explode_infos_fields = "*"
 1539
 1540        # If fields list not empty
 1541        if explode_infos_fields:
 1542
 1543            # Input fields list
 1544            if isinstance(explode_infos_fields, str):
 1545                fields_input = explode_infos_fields.split(",")
 1546            elif isinstance(explode_infos_fields, list):
 1547                fields_input = explode_infos_fields
 1548            else:
 1549                fields_input = []
 1550
 1551            # Fields list without * keyword
 1552            fields_without_all = fields_input.copy()
 1553            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1554                fields_without_all.remove("*")
 1555
 1556            # Fields in header
 1557            fields_in_header = sorted(list(set(self.get_header().infos)))
 1558
 1559            # Construct list of fields
 1560            fields_output = []
 1561            for field in fields_input:
 1562
 1563                # Strip field
 1564                field = field.strip()
 1565
 1566                # format keyword * in regex
 1567                if field.upper() in ["*"]:
 1568                    field = ".*"
 1569
 1570                # Find all fields with pattern
 1571                r = re.compile(field)
 1572                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1573
 1574                # Remove fields input from search
 1575                if field in fields_search:
 1576                    fields_search = [field]
 1577                elif fields_search != [field]:
 1578                    fields_search = sorted(
 1579                        list(set(fields_search).difference(fields_input))
 1580                    )
 1581
 1582                # If field is not in header (avoid not well formatted header)
 1583                if not fields_search and not remove_fields_not_in_header:
 1584                    fields_search = [field]
 1585
 1586                # Add found fields
 1587                for new_field in fields_search:
 1588                    # Add field, if not already exists, and if it is in header (if asked)
 1589                    if (
 1590                        new_field not in fields_output
 1591                        and (
 1592                            not remove_fields_not_in_header
 1593                            or new_field in fields_in_header
 1594                        )
 1595                        and new_field not in [".*"]
 1596                    ):
 1597                        fields_output.append(new_field)
 1598
 1599            return fields_output
 1600
 1601        else:
 1602
 1603            return []
 1604
 1605    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1606        """
 1607        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1608        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1609        not provided.
 1610
 1611        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1612        prefix to be used for exploding or expanding information
 1613        :type explode_infos_prefix: str
 1614        :return: the value of the variable `explode_infos_prefix`.
 1615        """
 1616
 1617        if not explode_infos_prefix:
 1618            explode_infos_prefix = (
 1619                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1620            )
 1621
 1622        return explode_infos_prefix
 1623
 1624    def add_column(
 1625        self,
 1626        table_name,
 1627        column_name,
 1628        column_type,
 1629        default_value=None,
 1630        drop: bool = False,
 1631    ) -> dict:
 1632        """
 1633        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1634        doesn't already exist.
 1635
 1636        :param table_name: The name of the table to which you want to add a column
 1637        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1638        to the table
 1639        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1640        want to add to the table. It should be a string that represents the desired data type, such as
 1641        "INTEGER", "TEXT", "REAL", etc
 1642        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1643        default value for the newly added column. If a default value is provided, it will be assigned to
 1644        the column for any existing rows that do not have a value for that column
 1645        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1646        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1647        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1648        to False
 1649        :type drop: bool (optional)
 1650        :return: a boolean value indicating whether the column was successfully added to the table.
 1651        """
 1652
 1653        # added
 1654        added = False
 1655        dropped = False
 1656
 1657        # Check if the column already exists in the table
 1658        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1659        columns = self.get_query_to_df(query).columns.tolist()
 1660        if column_name.upper() in [c.upper() for c in columns]:
 1661            log.debug(
 1662                f"The {column_name} column already exists in the {table_name} table"
 1663            )
 1664            if drop:
 1665                self.drop_column(table_name=table_name, column_name=column_name)
 1666                dropped = True
 1667            else:
 1668                return None
 1669        else:
 1670            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1671
 1672        # Add column in table
 1673        add_column_query = (
 1674            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1675        )
 1676        if default_value is not None:
 1677            add_column_query += f" DEFAULT {default_value}"
 1678        self.execute_query(add_column_query)
 1679        added = not dropped
 1680        log.debug(
 1681            f"The {column_name} column was successfully added to the {table_name} table"
 1682        )
 1683
 1684        if added:
 1685            added_column = {
 1686                "table_name": table_name,
 1687                "column_name": column_name,
 1688                "column_type": column_type,
 1689                "default_value": default_value,
 1690            }
 1691        else:
 1692            added_column = None
 1693
 1694        return added_column
 1695
 1696    def drop_column(
 1697        self, column: dict = None, table_name: str = None, column_name: str = None
 1698    ) -> bool:
 1699        """
 1700        The `drop_column` function drops a specified column from a given table in a database and returns
 1701        True if the column was successfully dropped, and False if the column does not exist in the
 1702        table.
 1703
 1704        :param column: The `column` parameter is a dictionary that contains information about the column
 1705        you want to drop. It has two keys:
 1706        :type column: dict
 1707        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1708        drop a column
 1709        :type table_name: str
 1710        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1711        from the table
 1712        :type column_name: str
 1713        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1714        and False if the column does not exist in the table.
 1715        """
 1716
 1717        # Find column infos
 1718        if column:
 1719            if isinstance(column, dict):
 1720                table_name = column.get("table_name", None)
 1721                column_name = column.get("column_name", None)
 1722            elif isinstance(column, str):
 1723                table_name = self.get_table_variants()
 1724                column_name = column
 1725            else:
 1726                table_name = None
 1727                column_name = None
 1728
 1729        if not table_name and not column_name:
 1730            return False
 1731
 1732        # Removed
 1733        removed = False
 1734
 1735        # Check if the column already exists in the table
 1736        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1737        columns = self.get_query_to_df(query).columns.tolist()
 1738        if column_name in columns:
 1739            log.debug(f"The {column_name} column exists in the {table_name} table")
 1740        else:
 1741            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1742            return False
 1743
 1744        # Add column in table # ALTER TABLE integers DROP k
 1745        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1746        self.execute_query(add_column_query)
 1747        removed = True
 1748        log.debug(
 1749            f"The {column_name} column was successfully dropped to the {table_name} table"
 1750        )
 1751
 1752        return removed
 1753
 1754    def explode_infos(
 1755        self,
 1756        prefix: str = None,
 1757        create_index: bool = False,
 1758        fields: list = None,
 1759        force: bool = False,
 1760        proccess_all_fields_together: bool = False,
 1761        table: str = None,
 1762    ) -> list:
 1763        """
 1764        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
 1765        individual columns, returning a list of added columns.
 1766
 1767        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
 1768        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
 1769        `self.get_explode_infos_prefix()` as the prefix
 1770        :type prefix: str
 1771        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
 1772        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
 1773        `False`, indexes will not be created. The default value is `False`, defaults to False
 1774        :type create_index: bool (optional)
 1775        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
 1776        that you want to explode into individual columns. If this parameter is not provided, all INFO
 1777        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
 1778        a list to the `
 1779        :type fields: list
 1780        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
 1781        determines whether to drop and recreate a column if it already exists in the table. If `force`
 1782        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
 1783        defaults to False
 1784        :type force: bool (optional)
 1785        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
 1786        flag that determines whether to process all the INFO fields together or individually. If set to
 1787        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
 1788        be processed individually. The default value is, defaults to False
 1789        :type proccess_all_fields_together: bool (optional)
 1790        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
 1791        of the table where the exploded INFO fields will be added as individual columns. If you provide
 1792        a value for the `table` parameter, the function will use that table name. If the `table`
 1793        parameter is
 1794        :type table: str
 1795        :return: The `explode_infos` function returns a list of added columns.
 1796        """
 1797
 1798        # drop indexes
 1799        self.drop_indexes()
 1800
 1801        # connexion format
 1802        connexion_format = self.get_connexion_format()
 1803
 1804        # Access
 1805        access = self.get_config().get("access", None)
 1806
 1807        # Added columns
 1808        added_columns = []
 1809
 1810        if access not in ["RO"]:
 1811
 1812            # prefix
 1813            if prefix in [None, True] or not isinstance(prefix, str):
 1814                if self.get_explode_infos_prefix() not in [None, True]:
 1815                    prefix = self.get_explode_infos_prefix()
 1816                else:
 1817                    prefix = "INFO/"
 1818
 1819            # table variants
 1820            if table is not None:
 1821                table_variants = table
 1822            else:
 1823                table_variants = self.get_table_variants(clause="select")
 1824
 1825            # extra infos
 1826            try:
 1827                extra_infos = self.get_extra_infos()
 1828            except:
 1829                extra_infos = []
 1830
 1831            # Header infos
 1832            header_infos = self.get_header().infos
 1833
 1834            log.debug(
 1835                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
 1836            )
 1837
 1838            sql_info_alter_table_array = []
 1839
 1840            # Info fields to check
 1841            fields_list = list(header_infos)
 1842            if fields:
 1843                fields_list += fields
 1844            fields_list = set(fields_list)
 1845
 1846            # If no fields
 1847            if not fields:
 1848                fields = []
 1849
 1850            # Translate fields if patterns
 1851            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
 1852
 1853            for info in fields:
 1854
 1855                info_id_sql = prefix + info
 1856
 1857                if (
 1858                    info in fields_list
 1859                    or prefix + info in fields_list
 1860                    or info in extra_infos
 1861                ):
 1862
 1863                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
 1864
 1865                    if info in header_infos:
 1866                        info_type = header_infos[info].type
 1867                        info_num = header_infos[info].num
 1868                    else:
 1869                        info_type = "String"
 1870                        info_num = 0
 1871
 1872                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
 1873                    if info_num != 1:
 1874                        type_sql = "VARCHAR"
 1875
 1876                    # Add field
 1877                    added_column = self.add_column(
 1878                        table_name=table_variants,
 1879                        column_name=info_id_sql,
 1880                        column_type=type_sql,
 1881                        default_value="null",
 1882                        drop=force,
 1883                    )
 1884
 1885                    if added_column:
 1886                        added_columns.append(added_column)
 1887
 1888                    if added_column or force:
 1889
 1890                        # add field to index
 1891                        self.index_additionnal_fields.append(info_id_sql)
 1892
 1893                        # Update field array
 1894                        if connexion_format in ["duckdb"]:
 1895                            update_info_field = f"""
 1896                            "{info_id_sql}" =
 1897                                CASE
 1898                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
 1899                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
 1900                                END
 1901                            """
 1902                        elif connexion_format in ["sqlite"]:
 1903                            update_info_field = f"""
 1904                                "{info_id_sql}" =
 1905                                    CASE
 1906                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
 1907                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
 1908                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
 1909                                    END
 1910                            """
 1911
 1912                        sql_info_alter_table_array.append(update_info_field)
 1913
 1914            if sql_info_alter_table_array:
 1915
 1916                # By chromosomes
 1917                try:
 1918                    chromosomes_list = list(
 1919                        self.get_query_to_df(
 1920                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
 1921                        )["#CHROM"]
 1922                    )
 1923                except:
 1924                    chromosomes_list = [None]
 1925
 1926                for chrom in chromosomes_list:
 1927                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
 1928
 1929                    # Where clause
 1930                    where_clause = ""
 1931                    if chrom and len(chromosomes_list) > 1:
 1932                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
 1933
 1934                    # Update table
 1935                    if proccess_all_fields_together:
 1936                        sql_info_alter_table_array_join = ", ".join(
 1937                            sql_info_alter_table_array
 1938                        )
 1939                        if sql_info_alter_table_array_join:
 1940                            sql_info_alter_table = f"""
 1941                                UPDATE {table_variants}
 1942                                SET {sql_info_alter_table_array_join}
 1943                                {where_clause}
 1944                                """
 1945                            log.debug(
 1946                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
 1947                            )
 1948                            # log.debug(sql_info_alter_table)
 1949                            self.conn.execute(sql_info_alter_table)
 1950                    else:
 1951                        sql_info_alter_num = 0
 1952                        for sql_info_alter in sql_info_alter_table_array:
 1953                            sql_info_alter_num += 1
 1954                            sql_info_alter_table = f"""
 1955                                UPDATE {table_variants}
 1956                                SET {sql_info_alter}
 1957                                {where_clause}
 1958                                """
 1959                            log.debug(
 1960                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
 1961                            )
 1962                            # log.debug(sql_info_alter_table)
 1963                            self.conn.execute(sql_info_alter_table)
 1964
 1965        # create indexes
 1966        if create_index:
 1967            self.create_indexes()
 1968
 1969        return added_columns
 1970
 1971    def create_indexes(self) -> None:
 1972        """
 1973        Create indexes on the table after insertion
 1974        """
 1975
 1976        # Access
 1977        access = self.get_config().get("access", None)
 1978
 1979        # get table variants
 1980        table_variants = self.get_table_variants("FROM")
 1981
 1982        if self.get_indexing() and access not in ["RO"]:
 1983            # Create index
 1984            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 1985            self.conn.execute(sql_create_table_index)
 1986            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 1987            self.conn.execute(sql_create_table_index)
 1988            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 1989            self.conn.execute(sql_create_table_index)
 1990            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 1991            self.conn.execute(sql_create_table_index)
 1992            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 1993            self.conn.execute(sql_create_table_index)
 1994            for field in self.index_additionnal_fields:
 1995                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 1996                self.conn.execute(sql_create_table_index)
 1997
 1998    def drop_indexes(self) -> None:
 1999        """
 2000        Create indexes on the table after insertion
 2001        """
 2002
 2003        # Access
 2004        access = self.get_config().get("access", None)
 2005
 2006        # get table variants
 2007        table_variants = self.get_table_variants("FROM")
 2008
 2009        # Get database format
 2010        connexion_format = self.get_connexion_format()
 2011
 2012        if access not in ["RO"]:
 2013            if connexion_format in ["duckdb"]:
 2014                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2015            elif connexion_format in ["sqlite"]:
 2016                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2017
 2018            list_indexes = self.conn.execute(sql_list_indexes)
 2019            index_names = [row[0] for row in list_indexes.fetchall()]
 2020            for index in index_names:
 2021                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2022                self.conn.execute(sql_drop_table_index)
 2023
 2024    def read_vcf_header(self, f) -> list:
 2025        """
 2026        It reads the header of a VCF file and returns a list of the header lines
 2027
 2028        :param f: the file object
 2029        :return: The header lines of the VCF file.
 2030        """
 2031
 2032        header_list = []
 2033        for line in f:
 2034            header_list.append(line)
 2035            if line.startswith("#CHROM"):
 2036                break
 2037        return header_list
 2038
 2039    def read_vcf_header_file(self, file: str = None) -> list:
 2040        """
 2041        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2042        uncompressed files.
 2043
 2044        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2045        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2046        default to `None`
 2047        :type file: str
 2048        :return: The function `read_vcf_header_file` returns a list.
 2049        """
 2050
 2051        if self.get_input_compressed(input_file=file):
 2052            with bgzf.open(file, "rt") as f:
 2053                return self.read_vcf_header(f=f)
 2054        else:
 2055            with open(file, "rt") as f:
 2056                return self.read_vcf_header(f=f)
 2057
 2058    def execute_query(self, query: str):
 2059        """
 2060        It takes a query as an argument, executes it, and returns the results
 2061
 2062        :param query: The query to be executed
 2063        :return: The result of the query is being returned.
 2064        """
 2065        if query:
 2066            return self.conn.execute(query)  # .fetchall()
 2067        else:
 2068            return None
 2069
 2070    def export_output(
 2071        self,
 2072        output_file: str | None = None,
 2073        output_header: str | None = None,
 2074        export_header: bool = True,
 2075        query: str | None = None,
 2076        parquet_partitions: list | None = None,
 2077        chunk_size: int | None = None,
 2078        threads: int | None = None,
 2079        sort: bool = False,
 2080        index: bool = False,
 2081        order_by: str | None = None,
 2082    ) -> bool:
 2083        """
 2084        The `export_output` function exports data from a VCF file to a specified output file in various
 2085        formats, including VCF, CSV, TSV, PSV, and Parquet.
 2086
 2087        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2088        output file to be generated by the function. This is where the exported data will be saved
 2089        :type output_file: str
 2090        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2091        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2092        header will be exported to a file with the same name as the `output_file` parameter, but with
 2093        the extension "
 2094        :type output_header: str
 2095        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2096        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2097        True, the header will be exported to a file. If `export_header` is False, the header will not
 2098        be, defaults to True, if output format is not VCF
 2099        :type export_header: bool (optional)
 2100        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 2101        select specific data from the VCF file before exporting it. If provided, only the data that
 2102        matches the query will be exported
 2103        :type query: str
 2104        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2105        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2106        organize data in a hierarchical directory structure based on the values of one or more columns.
 2107        This can improve query performance when working with large datasets
 2108        :type parquet_partitions: list
 2109        :param chunk_size: The `chunk_size` parameter specifies the number of
 2110        records in batch when exporting data in Parquet format. This parameter is used for
 2111        partitioning the Parquet file into multiple files.
 2112        :type chunk_size: int
 2113        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2114        threads to be used during the export process. It determines the level of parallelism and can
 2115        improve the performance of the export operation. If not provided, the function will use the
 2116        default number of threads
 2117        :type threads: int
 2118        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2119        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2120        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2121        False
 2122        :type sort: bool (optional)
 2123        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2124        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2125        no index will be created. The default value is False, defaults to False
 2126        :type index: bool (optional)
 2127        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2128        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2129        :type order_by: str
 2130        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2131        None if it doesn't.
 2132        """
 2133
 2134        # Log
 2135        log.info("Exporting...")
 2136
 2137        # Full path
 2138        output_file = full_path(output_file)
 2139        output_header = full_path(output_header)
 2140
 2141        # Config
 2142        config = self.get_config()
 2143
 2144        # Param
 2145        param = self.get_param()
 2146
 2147        # Tmp files to remove
 2148        tmp_to_remove = []
 2149
 2150        # If no output, get it
 2151        if not output_file:
 2152            output_file = self.get_output()
 2153
 2154        # If not threads
 2155        if not threads:
 2156            threads = self.get_threads()
 2157
 2158        # Auto header name with extension
 2159        if export_header or output_header:
 2160            if not output_header:
 2161                output_header = f"{output_file}.hdr"
 2162            # Export header
 2163            self.export_header(output_file=output_file)
 2164
 2165        # Switch off export header if VCF output
 2166        output_file_type = get_file_format(output_file)
 2167        if output_file_type in ["vcf"]:
 2168            export_header = False
 2169            tmp_to_remove.append(output_header)
 2170
 2171        # Chunk size
 2172        if not chunk_size:
 2173            chunk_size = config.get("chunk_size", None)
 2174
 2175        # Parquet partition
 2176        if not parquet_partitions:
 2177            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2178        if parquet_partitions and isinstance(parquet_partitions, str):
 2179            parquet_partitions = parquet_partitions.split(",")
 2180
 2181        # Order by
 2182        if not order_by:
 2183            order_by = param.get("export", {}).get("order_by", "")
 2184
 2185        # Header in output
 2186        header_in_output = param.get("export", {}).get("include_header", False)
 2187
 2188        # Database
 2189        database_source = self.get_connexion()
 2190
 2191        # Connexion format
 2192        connexion_format = self.get_connexion_format()
 2193
 2194        # Explode infos
 2195        if self.get_explode_infos():
 2196            self.explode_infos(
 2197                prefix=self.get_explode_infos_prefix(),
 2198                fields=self.get_explode_infos_fields(),
 2199                force=False,
 2200            )
 2201
 2202        # if connexion_format in ["sqlite"] or query:
 2203        if connexion_format in ["sqlite"]:
 2204
 2205            # Export in Parquet
 2206            random_tmp = "".join(
 2207                random.choice(string.ascii_lowercase) for i in range(10)
 2208            )
 2209            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2210            tmp_to_remove.append(database_source)
 2211
 2212            # Table Variants
 2213            table_variants = self.get_table_variants()
 2214
 2215            # Create export query
 2216            sql_query_export_subquery = f"""
 2217                SELECT * FROM {table_variants}
 2218                """
 2219
 2220            # Write source file
 2221            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2222
 2223        # Create database
 2224        database = Database(
 2225            database=database_source,
 2226            table="variants",
 2227            header_file=output_header,
 2228            conn_config=self.get_connexion_config(),
 2229        )
 2230
 2231        # Existing colomns header
 2232        existing_columns_header = database.get_header_columns_from_database()
 2233
 2234        # Sample list
 2235        get_samples = self.get_samples()
 2236        get_samples_check = self.get_samples_check()
 2237        samples_force = get_samples is not None
 2238        sample_list = self.get_header_sample_list(
 2239            check=get_samples_check, samples=get_samples, samples_force=samples_force
 2240        )
 2241
 2242        # Export file
 2243        database.export(
 2244            output_database=output_file,
 2245            output_header=output_header,
 2246            existing_columns_header=existing_columns_header,
 2247            parquet_partitions=parquet_partitions,
 2248            chunk_size=chunk_size,
 2249            threads=threads,
 2250            sort=sort,
 2251            index=index,
 2252            header_in_output=header_in_output,
 2253            order_by=order_by,
 2254            query=query,
 2255            export_header=export_header,
 2256            sample_list=sample_list,
 2257        )
 2258
 2259        # Remove
 2260        remove_if_exists(tmp_to_remove)
 2261
 2262        return (os.path.exists(output_file) or None) and (
 2263            os.path.exists(output_file) or None
 2264        )
 2265
 2266    def get_extra_infos(self, table: str = None) -> list:
 2267        """
 2268        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2269        in the header.
 2270
 2271        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2272        name of the table from which you want to retrieve the extra columns that are not present in the
 2273        header. If the `table` parameter is not provided when calling the function, it will default to
 2274        using the variants
 2275        :type table: str
 2276        :return: A list of columns that are in the specified table but not in the header of the table.
 2277        """
 2278
 2279        header_columns = []
 2280
 2281        if not table:
 2282            table = self.get_table_variants(clause="from")
 2283            header_columns = self.get_header_columns()
 2284
 2285        # Check all columns in the database
 2286        query = f""" SELECT * FROM {table} LIMIT 1 """
 2287        log.debug(f"query {query}")
 2288        table_columns = self.get_query_to_df(query).columns.tolist()
 2289        extra_columns = []
 2290
 2291        # Construct extra infos (not in header)
 2292        for column in table_columns:
 2293            if column not in header_columns:
 2294                extra_columns.append(column)
 2295
 2296        return extra_columns
 2297
 2298    def get_extra_infos_sql(self, table: str = None) -> str:
 2299        """
 2300        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2301        by double quotes
 2302
 2303        :param table: The name of the table to get the extra infos from. If None, the default table is
 2304        used
 2305        :type table: str
 2306        :return: A string of the extra infos
 2307        """
 2308
 2309        return ", ".join(
 2310            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2311        )
 2312
 2313    def export_header(
 2314        self,
 2315        header_name: str = None,
 2316        output_file: str = None,
 2317        output_file_ext: str = ".hdr",
 2318        clean_header: bool = True,
 2319        remove_chrom_line: bool = False,
 2320    ) -> str:
 2321        """
 2322        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2323        specified options, and writes it to a new file.
 2324
 2325        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2326        this parameter is not specified, the header will be written to the output file
 2327        :type header_name: str
 2328        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2329        specify the name of the output file where the header will be written. If this parameter is not
 2330        provided, the header will be written to a temporary file
 2331        :type output_file: str
 2332        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2333        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2334        if not specified by the user. This extension will be appended to the `output_file` name to
 2335        create the final, defaults to .hdr
 2336        :type output_file_ext: str (optional)
 2337        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2338        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2339        `True`, the function will clean the header by modifying certain lines based on a specific
 2340        pattern. If `clean_header`, defaults to True
 2341        :type clean_header: bool (optional)
 2342        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2343        boolean flag that determines whether the #CHROM line should be removed from the header before
 2344        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2345        defaults to False
 2346        :type remove_chrom_line: bool (optional)
 2347        :return: The function `export_header` returns the name of the temporary header file that is
 2348        created.
 2349        """
 2350
 2351        if not header_name and not output_file:
 2352            output_file = self.get_output()
 2353
 2354        if self.get_header():
 2355
 2356            # Get header object
 2357            header_obj = self.get_header()
 2358
 2359            # Create database
 2360            db_for_header = Database(database=self.get_input())
 2361
 2362            # Get real columns in the file
 2363            db_header_columns = db_for_header.get_columns()
 2364
 2365            with tempfile.TemporaryDirectory() as tmpdir:
 2366
 2367                # Write header file
 2368                header_file_tmp = os.path.join(tmpdir, "header")
 2369                f = open(header_file_tmp, "w")
 2370                vcf.Writer(f, header_obj)
 2371                f.close()
 2372
 2373                # Replace #CHROM line with rel columns
 2374                header_list = db_for_header.read_header_file(
 2375                    header_file=header_file_tmp
 2376                )
 2377                header_list[-1] = "\t".join(db_header_columns)
 2378
 2379                # Remove CHROM line
 2380                if remove_chrom_line:
 2381                    header_list.pop()
 2382
 2383                # Clean header
 2384                if clean_header:
 2385                    header_list_clean = []
 2386                    for head in header_list:
 2387                        # Clean head for malformed header
 2388                        head_clean = head
 2389                        head_clean = re.subn(
 2390                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2391                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2392                            head_clean,
 2393                            2,
 2394                        )[0]
 2395                        # Write header
 2396                        header_list_clean.append(head_clean)
 2397                    header_list = header_list_clean
 2398
 2399            tmp_header_name = output_file + output_file_ext
 2400
 2401            f = open(tmp_header_name, "w")
 2402            for line in header_list:
 2403                f.write(line)
 2404            f.close()
 2405
 2406        return tmp_header_name
 2407
 2408    def export_variant_vcf(
 2409        self,
 2410        vcf_file,
 2411        remove_info: bool = False,
 2412        add_samples: bool = True,
 2413        list_samples: list = [],
 2414        where_clause: str = "",
 2415        index: bool = False,
 2416        threads: int | None = None,
 2417    ) -> bool | None:
 2418        """
 2419        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2420        remove INFO field, add samples, and control compression and indexing.
 2421
 2422        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2423        written to. It is the output file that will contain the filtered VCF data based on the specified
 2424        parameters
 2425        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2426        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2427        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2428        in, defaults to False
 2429        :type remove_info: bool (optional)
 2430        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2431        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2432        If set to False, the samples will be removed. The default value is True, defaults to True
 2433        :type add_samples: bool (optional)
 2434        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2435        in the output VCF file. By default, all samples will be included. If you provide a list of
 2436        samples, only those samples will be included in the output file
 2437        :type list_samples: list
 2438        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2439        determines whether or not to create an index for the output VCF file. If `index` is set to
 2440        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2441        :type index: bool (optional)
 2442        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2443        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2444        will be used during the export process. More threads can potentially speed up the export process
 2445        by utilizing multiple cores of the processor. If
 2446        :type threads: int | None
 2447        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2448        method with various parameters including the output file, query, threads, sort flag, and index
 2449        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2450        specified parameters and configurations provided in the `export_variant_vcf` function.
 2451        """
 2452
 2453        # Config
 2454        config = self.get_config()
 2455
 2456        # Extract VCF
 2457        log.debug("Export VCF...")
 2458
 2459        # Table variants
 2460        table_variants = self.get_table_variants()
 2461
 2462        # Threads
 2463        if not threads:
 2464            threads = self.get_threads()
 2465
 2466        # Info fields
 2467        if remove_info:
 2468            if not isinstance(remove_info, str):
 2469                remove_info = "."
 2470            info_field = f"""'{remove_info}' as INFO"""
 2471        else:
 2472            info_field = "INFO"
 2473
 2474        # Samples fields
 2475        if add_samples:
 2476            if not list_samples:
 2477                list_samples = self.get_header_sample_list()
 2478            if list_samples:
 2479                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2480            else:
 2481                samples_fields = ""
 2482            log.debug(f"samples_fields: {samples_fields}")
 2483        else:
 2484            samples_fields = ""
 2485
 2486        # Where clause
 2487        if where_clause is None:
 2488            where_clause = ""
 2489
 2490        # Variants
 2491        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2492        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2493        log.debug(f"sql_query_select={sql_query_select}")
 2494
 2495        return self.export_output(
 2496            output_file=vcf_file,
 2497            output_header=None,
 2498            export_header=True,
 2499            query=sql_query_select,
 2500            parquet_partitions=None,
 2501            chunk_size=config.get("chunk_size", None),
 2502            threads=threads,
 2503            sort=True,
 2504            index=index,
 2505            order_by=None,
 2506        )
 2507
 2508    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2509        """
 2510        It takes a list of commands and runs them in parallel using the number of threads specified
 2511
 2512        :param commands: A list of commands to run
 2513        :param threads: The number of threads to use, defaults to 1 (optional)
 2514        """
 2515
 2516        run_parallel_commands(commands, threads)
 2517
 2518    def get_threads(self, default: int = 1) -> int:
 2519        """
 2520        This function returns the number of threads to use for a job, with a default value of 1 if not
 2521        specified.
 2522
 2523        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2524        default number of threads to use if no specific value is provided. If no value is provided for
 2525        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2526        used, defaults to 1
 2527        :type default: int (optional)
 2528        :return: the number of threads to use for the current job.
 2529        """
 2530
 2531        # Config
 2532        config = self.get_config()
 2533
 2534        # Param
 2535        param = self.get_param()
 2536
 2537        # Input threads
 2538        input_thread = param.get("threads", config.get("threads", None))
 2539
 2540        # Check threads
 2541        if not input_thread:
 2542            threads = default
 2543        elif int(input_thread) <= 0:
 2544            threads = os.cpu_count()
 2545        else:
 2546            threads = int(input_thread)
 2547        return threads
 2548
 2549    def get_memory(self, default: str = None) -> str:
 2550        """
 2551        This function retrieves the memory value from parameters or configuration with a default value
 2552        if not found.
 2553
 2554        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2555        default value is used as a fallback in case the `memory` parameter is not provided in the
 2556        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2557        the function
 2558        :type default: str
 2559        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2560        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2561        return the default value provided as an argument to the function.
 2562        """
 2563
 2564        # Config
 2565        config = self.get_config()
 2566
 2567        # Param
 2568        param = self.get_param()
 2569
 2570        # Input threads
 2571        input_memory = param.get("memory", config.get("memory", None))
 2572
 2573        # Check threads
 2574        if input_memory:
 2575            memory = input_memory
 2576        else:
 2577            memory = default
 2578
 2579        return memory
 2580
 2581    def update_from_vcf(self, vcf_file: str) -> None:
 2582        """
 2583        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2584
 2585        :param vcf_file: the path to the VCF file
 2586        """
 2587
 2588        connexion_format = self.get_connexion_format()
 2589
 2590        if connexion_format in ["duckdb"]:
 2591            self.update_from_vcf_duckdb(vcf_file)
 2592        elif connexion_format in ["sqlite"]:
 2593            self.update_from_vcf_sqlite(vcf_file)
 2594
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the variants table (duckdb).

        The VCF body is loaded into a pandas DataFrame bound to the local name
        ``vcf_df``; the UPDATE query references that name directly, relying on
        duckdb's pandas replacement scan, so the variable name and the table
        name inside the SQL must stay in sync. For each variant matched on
        #CHROM/POS/REF/ALT, the VCF INFO value is appended to the existing
        INFO, with ';' inserted when both sides are non-empty ('' and '.' are
        treated as empty).

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the meta-information
        # lines so the #CHROM line becomes the column header (header=0)
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # UPDATE joins the variants table against the in-memory DataFrame
        # (referenced as 'vcf_df' via duckdb's replacement scan)
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2650
 2651    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2652        """
 2653        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2654        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2655        table
 2656
 2657        :param vcf_file: The path to the VCF file you want to update the database with
 2658        """
 2659
 2660        # Create a temporary table for the VCF
 2661        table_vcf = "tmp_vcf"
 2662        sql_create = (
 2663            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2664        )
 2665        self.conn.execute(sql_create)
 2666
 2667        # Loading VCF into temporaire table
 2668        vcf_df = pd.read_csv(
 2669            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2670        )
 2671        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2672        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2673
 2674        # Update table 'variants' with VCF data
 2675        # warning: CONCAT as || operator
 2676        sql_query_update = f"""
 2677            UPDATE variants as table_variants
 2678            SET INFO = CASE
 2679                            WHEN INFO NOT IN ('', '.')
 2680                            THEN INFO
 2681                            ELSE ''
 2682                        END ||
 2683                        (
 2684                        SELECT 
 2685                            CASE 
 2686                                WHEN table_variants.INFO NOT IN ('','.') 
 2687                                    AND table_vcf.INFO NOT IN ('','.')  
 2688                                THEN ';' 
 2689                                ELSE '' 
 2690                            END || 
 2691                            CASE 
 2692                                WHEN table_vcf.INFO NOT IN ('','.') 
 2693                                THEN table_vcf.INFO 
 2694                                ELSE '' 
 2695                            END
 2696                        FROM {table_vcf} as table_vcf
 2697                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2698                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2699                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2700                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2701                        )
 2702        """
 2703        self.conn.execute(sql_query_update)
 2704
 2705        # Drop temporary table
 2706        sql_drop = f"DROP TABLE {table_vcf}"
 2707        self.conn.execute(sql_drop)
 2708
 2709    def drop_variants_table(self) -> None:
 2710        """
 2711        > This function drops the variants table
 2712        """
 2713
 2714        table_variants = self.get_table_variants()
 2715        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2716        self.conn.execute(sql_table_variants)
 2717
 2718    def set_variant_id(
 2719        self, variant_id_column: str = "variant_id", force: bool = None
 2720    ) -> str:
 2721        """
 2722        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2723        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2724
 2725        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2726        to variant_id
 2727        :type variant_id_column: str (optional)
 2728        :param force: If True, the variant_id column will be created even if it already exists
 2729        :type force: bool
 2730        :return: The name of the column that contains the variant_id
 2731        """
 2732
 2733        # Assembly
 2734        assembly = self.get_param().get(
 2735            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2736        )
 2737
 2738        # INFO/Tag prefix
 2739        prefix = self.get_explode_infos_prefix()
 2740
 2741        # Explode INFO/SVTYPE
 2742        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2743
 2744        # variants table
 2745        table_variants = self.get_table_variants()
 2746
 2747        # variant_id column
 2748        if not variant_id_column:
 2749            variant_id_column = "variant_id"
 2750
 2751        # Creta variant_id column
 2752        if "variant_id" not in self.get_extra_infos() or force:
 2753
 2754            # Create column
 2755            self.add_column(
 2756                table_name=table_variants,
 2757                column_name=variant_id_column,
 2758                column_type="UBIGINT",
 2759                default_value="0",
 2760            )
 2761
 2762            # Update column
 2763            self.conn.execute(
 2764                f"""
 2765                    UPDATE {table_variants}
 2766                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2767                """
 2768            )
 2769
 2770        # Remove added columns
 2771        for added_column in added_columns:
 2772            self.drop_column(column=added_column)
 2773
 2774        # return variant_id column name
 2775        return variant_id_column
 2776
 2777    def get_variant_id_column(
 2778        self, variant_id_column: str = "variant_id", force: bool = None
 2779    ) -> str:
 2780        """
 2781        This function returns the variant_id column name
 2782
 2783        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2784        defaults to variant_id
 2785        :type variant_id_column: str (optional)
 2786        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2787        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2788        if it is not already set, or if it is set
 2789        :type force: bool
 2790        :return: The variant_id column name.
 2791        """
 2792
 2793        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2794
 2795    ###
 2796    # Annotation
 2797    ###
 2798
 2799    def scan_databases(
 2800        self,
 2801        database_formats: list = ["parquet"],
 2802        database_releases: list = ["current"],
 2803    ) -> dict:
 2804        """
 2805        The function `scan_databases` scans for available databases based on specified formats and
 2806        releases.
 2807
 2808        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2809        of the databases to be scanned. In this case, the accepted format is "parquet"
 2810        :type database_formats: list ["parquet"]
 2811        :param database_releases: The `database_releases` parameter is a list that specifies the
 2812        releases of the databases to be scanned. In the provided function, the default value for
 2813        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2814        databases that are in the "current"
 2815        :type database_releases: list
 2816        :return: The function `scan_databases` returns a dictionary containing information about
 2817        databases that match the specified formats and releases.
 2818        """
 2819
 2820        # Config
 2821        config = self.get_config()
 2822
 2823        # Param
 2824        param = self.get_param()
 2825
 2826        # Param - Assembly
 2827        assembly = param.get("assembly", config.get("assembly", None))
 2828        if not assembly:
 2829            assembly = DEFAULT_ASSEMBLY
 2830            log.warning(f"Default assembly '{assembly}'")
 2831
 2832        # Scan for availabled databases
 2833        log.info(
 2834            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2835        )
 2836        databases_infos_dict = databases_infos(
 2837            database_folder_releases=database_releases,
 2838            database_formats=database_formats,
 2839            assembly=assembly,
 2840            config=config,
 2841        )
 2842        log.info(
 2843            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2844        )
 2845
 2846        return databases_infos_dict
 2847
 2848    def annotation(self) -> None:
 2849        """
 2850        It annotates the VCF file with the annotations specified in the config file.
 2851        """
 2852
 2853        # Config
 2854        config = self.get_config()
 2855
 2856        # Param
 2857        param = self.get_param()
 2858
 2859        # Param - Assembly
 2860        assembly = param.get("assembly", config.get("assembly", None))
 2861        if not assembly:
 2862            assembly = DEFAULT_ASSEMBLY
 2863            log.warning(f"Default assembly '{assembly}'")
 2864
 2865        # annotations databases folders
 2866        annotations_databases = set(
 2867            config.get("folders", {})
 2868            .get("databases", {})
 2869            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
 2870            + config.get("folders", {})
 2871            .get("databases", {})
 2872            .get("parquet", ["~/howard/databases/parquet/current"])
 2873            + config.get("folders", {})
 2874            .get("databases", {})
 2875            .get("bcftools", ["~/howard/databases/bcftools/current"])
 2876        )
 2877
 2878        # Get param annotations
 2879        if param.get("annotations", None) and isinstance(
 2880            param.get("annotations", None), str
 2881        ):
 2882            log.debug(param.get("annotations", None))
 2883            param_annotation_list = param.get("annotations").split(",")
 2884        else:
 2885            param_annotation_list = []
 2886
 2887        # Each tools param
 2888        if param.get("annotation_parquet", None) != None:
 2889            log.debug(
 2890                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
 2891            )
 2892            if isinstance(param.get("annotation_parquet", None), list):
 2893                param_annotation_list.append(",".join(param.get("annotation_parquet")))
 2894            else:
 2895                param_annotation_list.append(param.get("annotation_parquet"))
 2896        if param.get("annotation_snpsift", None) != None:
 2897            if isinstance(param.get("annotation_snpsift", None), list):
 2898                param_annotation_list.append(
 2899                    "snpsift:"
 2900                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
 2901                )
 2902            else:
 2903                param_annotation_list.append(
 2904                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
 2905                )
 2906        if param.get("annotation_snpeff", None) != None:
 2907            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
 2908        if param.get("annotation_bcftools", None) != None:
 2909            if isinstance(param.get("annotation_bcftools", None), list):
 2910                param_annotation_list.append(
 2911                    "bcftools:"
 2912                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
 2913                )
 2914            else:
 2915                param_annotation_list.append(
 2916                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
 2917                )
 2918        if param.get("annotation_annovar", None) != None:
 2919            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
 2920        if param.get("annotation_exomiser", None) != None:
 2921            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
 2922        if param.get("annotation_splice", None) != None:
 2923            param_annotation_list.append("splice:" + param.get("annotation_splice"))
 2924
 2925        # Merge param annotations list
 2926        param["annotations"] = ",".join(param_annotation_list)
 2927
 2928        # debug
 2929        log.debug(f"param_annotations={param['annotations']}")
 2930
 2931        if param.get("annotations"):
 2932
 2933            # Log
 2934            # log.info("Annotations - Check annotation parameters")
 2935
 2936            if not "annotation" in param:
 2937                param["annotation"] = {}
 2938
 2939            # List of annotations parameters
 2940            annotations_list_input = {}
 2941            if isinstance(param.get("annotations", None), str):
 2942                annotation_file_list = [
 2943                    value for value in param.get("annotations", "").split(",")
 2944                ]
 2945                for annotation_file in annotation_file_list:
 2946                    annotations_list_input[annotation_file] = {"INFO": None}
 2947            else:
 2948                annotations_list_input = param.get("annotations", {})
 2949
 2950            log.info(f"Quick Annotations:")
 2951            for annotation_key in list(annotations_list_input.keys()):
 2952                log.info(f"   {annotation_key}")
 2953
 2954            # List of annotations and associated fields
 2955            annotations_list = {}
 2956
 2957            for annotation_file in annotations_list_input:
 2958
 2959                # Explode annotations if ALL
 2960                if (
 2961                    annotation_file.upper() == "ALL"
 2962                    or annotation_file.upper().startswith("ALL:")
 2963                ):
 2964
 2965                    # check ALL parameters (formats, releases)
 2966                    annotation_file_split = annotation_file.split(":")
 2967                    database_formats = "parquet"
 2968                    database_releases = "current"
 2969                    for annotation_file_option in annotation_file_split[1:]:
 2970                        database_all_options_split = annotation_file_option.split("=")
 2971                        if database_all_options_split[0] == "format":
 2972                            database_formats = database_all_options_split[1].split("+")
 2973                        if database_all_options_split[0] == "release":
 2974                            database_releases = database_all_options_split[1].split("+")
 2975
 2976                    # Scan for availabled databases
 2977                    databases_infos_dict = self.scan_databases(
 2978                        database_formats=database_formats,
 2979                        database_releases=database_releases,
 2980                    )
 2981
 2982                    # Add found databases in annotation parameters
 2983                    for database_infos in databases_infos_dict.keys():
 2984                        annotations_list[database_infos] = {"INFO": None}
 2985
 2986                else:
 2987                    annotations_list[annotation_file] = annotations_list_input[
 2988                        annotation_file
 2989                    ]
 2990
 2991            # Check each databases
 2992            if len(annotations_list):
 2993
 2994                log.info(
 2995                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
 2996                )
 2997
 2998                for annotation_file in annotations_list:
 2999
 3000                    # Init
 3001                    annotations = annotations_list.get(annotation_file, None)
 3002
 3003                    # Annotation snpEff
 3004                    if annotation_file.startswith("snpeff"):
 3005
 3006                        log.debug(f"Quick Annotation snpEff")
 3007
 3008                        if "snpeff" not in param["annotation"]:
 3009                            param["annotation"]["snpeff"] = {}
 3010
 3011                        if "options" not in param["annotation"]["snpeff"]:
 3012                            param["annotation"]["snpeff"]["options"] = ""
 3013
 3014                        # snpEff options in annotations
 3015                        param["annotation"]["snpeff"]["options"] = "".join(
 3016                            annotation_file.split(":")[1:]
 3017                        )
 3018
 3019                    # Annotation Annovar
 3020                    elif annotation_file.startswith("annovar"):
 3021
 3022                        log.debug(f"Quick Annotation Annovar")
 3023
 3024                        if "annovar" not in param["annotation"]:
 3025                            param["annotation"]["annovar"] = {}
 3026
 3027                        if "annotations" not in param["annotation"]["annovar"]:
 3028                            param["annotation"]["annovar"]["annotations"] = {}
 3029
 3030                        # Options
 3031                        annotation_file_split = annotation_file.split(":")
 3032                        for annotation_file_annotation in annotation_file_split[1:]:
 3033                            if annotation_file_annotation:
 3034                                param["annotation"]["annovar"]["annotations"][
 3035                                    annotation_file_annotation
 3036                                ] = annotations
 3037
 3038                    # Annotation Exomiser
 3039                    elif annotation_file.startswith("exomiser"):
 3040
 3041                        log.debug(f"Quick Annotation Exomiser")
 3042
 3043                        param["annotation"]["exomiser"] = params_string_to_dict(
 3044                            annotation_file
 3045                        )
 3046
 3047                    # Annotation Splice
 3048                    elif annotation_file.startswith("splice"):
 3049
 3050                        log.debug(f"Quick Annotation Splice")
 3051
 3052                        param["annotation"]["splice"] = params_string_to_dict(
 3053                            annotation_file
 3054                        )
 3055
 3056                    # Annotation Parquet or BCFTOOLS
 3057                    else:
 3058
 3059                        # Tools detection
 3060                        if annotation_file.startswith("bcftools:"):
 3061                            annotation_tool_initial = "bcftools"
 3062                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3063                        elif annotation_file.startswith("snpsift:"):
 3064                            annotation_tool_initial = "snpsift"
 3065                            annotation_file = ":".join(annotation_file.split(":")[1:])
 3066                        else:
 3067                            annotation_tool_initial = None
 3068
 3069                        # list of files
 3070                        annotation_file_list = annotation_file.replace("+", ":").split(
 3071                            ":"
 3072                        )
 3073
 3074                        for annotation_file in annotation_file_list:
 3075
 3076                            if annotation_file:
 3077
 3078                                # Annotation tool initial
 3079                                annotation_tool = annotation_tool_initial
 3080
 3081                                # Find file
 3082                                annotation_file_found = None
 3083
 3084                                # Expand user
 3085                                annotation_file = full_path(annotation_file)
 3086
 3087                                if os.path.exists(annotation_file):
 3088                                    annotation_file_found = annotation_file
 3089
 3090                                else:
 3091                                    # Find within assembly folders
 3092                                    for annotations_database in annotations_databases:
 3093                                        found_files = find_all(
 3094                                            annotation_file,
 3095                                            os.path.join(
 3096                                                annotations_database, assembly
 3097                                            ),
 3098                                        )
 3099                                        if len(found_files) > 0:
 3100                                            annotation_file_found = found_files[0]
 3101                                            break
 3102                                    if not annotation_file_found and not assembly:
 3103                                        # Find within folders
 3104                                        for (
 3105                                            annotations_database
 3106                                        ) in annotations_databases:
 3107                                            found_files = find_all(
 3108                                                annotation_file, annotations_database
 3109                                            )
 3110                                            if len(found_files) > 0:
 3111                                                annotation_file_found = found_files[0]
 3112                                                break
 3113                                log.debug(
 3114                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
 3115                                )
 3116
 3117                                # Full path
 3118                                annotation_file_found = full_path(annotation_file_found)
 3119
 3120                                if annotation_file_found:
 3121
 3122                                    database = Database(database=annotation_file_found)
 3123                                    quick_annotation_format = database.get_format()
 3124                                    quick_annotation_is_compressed = (
 3125                                        database.is_compressed()
 3126                                    )
 3127                                    quick_annotation_is_indexed = os.path.exists(
 3128                                        f"{annotation_file_found}.tbi"
 3129                                    )
 3130                                    bcftools_preference = False
 3131
 3132                                    # Check Annotation Tool
 3133                                    if not annotation_tool:
 3134                                        if (
 3135                                            bcftools_preference
 3136                                            and quick_annotation_format
 3137                                            in ["vcf", "bed"]
 3138                                            and quick_annotation_is_compressed
 3139                                            and quick_annotation_is_indexed
 3140                                        ):
 3141                                            annotation_tool = "bcftools"
 3142                                        elif quick_annotation_format in [
 3143                                            "vcf",
 3144                                            "bed",
 3145                                            "tsv",
 3146                                            "tsv",
 3147                                            "csv",
 3148                                            "json",
 3149                                            "tbl",
 3150                                            "parquet",
 3151                                            "duckdb",
 3152                                        ]:
 3153                                            annotation_tool = "parquet"
 3154                                        else:
 3155                                            log.error(
 3156                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3157                                            )
 3158                                            raise ValueError(
 3159                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
 3160                                            )
 3161
 3162                                    log.debug(
 3163                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
 3164                                    )
 3165
 3166                                    # Annotation Tool dispatch
 3167                                    if annotation_tool:
 3168                                        if annotation_tool not in param["annotation"]:
 3169                                            param["annotation"][annotation_tool] = {}
 3170                                        if (
 3171                                            "annotations"
 3172                                            not in param["annotation"][annotation_tool]
 3173                                        ):
 3174                                            param["annotation"][annotation_tool][
 3175                                                "annotations"
 3176                                            ] = {}
 3177                                        param["annotation"][annotation_tool][
 3178                                            "annotations"
 3179                                        ][annotation_file_found] = annotations
 3180
 3181                                else:
 3182                                    log.error(
 3183                                        f"Quick Annotation File {annotation_file} does NOT exist"
 3184                                    )
 3185
 3186                self.set_param(param)
 3187
 3188        if param.get("annotation", None):
 3189            log.info("Annotations")
 3190            if param.get("annotation", {}).get("parquet", None):
 3191                log.info("Annotations 'parquet'...")
 3192                self.annotation_parquet()
 3193            if param.get("annotation", {}).get("bcftools", None):
 3194                log.info("Annotations 'bcftools'...")
 3195                self.annotation_bcftools()
 3196            if param.get("annotation", {}).get("snpsift", None):
 3197                log.info("Annotations 'snpsift'...")
 3198                self.annotation_snpsift()
 3199            if param.get("annotation", {}).get("annovar", None):
 3200                log.info("Annotations 'annovar'...")
 3201                self.annotation_annovar()
 3202            if param.get("annotation", {}).get("snpeff", None):
 3203                log.info("Annotations 'snpeff'...")
 3204                self.annotation_snpeff()
 3205            if param.get("annotation", {}).get("exomiser", None) is not None:
 3206                log.info("Annotations 'exomiser'...")
 3207                self.annotation_exomiser()
 3208            if param.get("annotation", {}).get("splice", None) is not None:
 3209                log.info("Annotations 'splice' ...")
 3210                self.annotation_splice()
 3211
 3212        # Explode INFOS fields into table fields
 3213        if self.get_explode_infos():
 3214            self.explode_infos(
 3215                prefix=self.get_explode_infos_prefix(),
 3216                fields=self.get_explode_infos_fields(),
 3217                force=True,
 3218            )
 3219
 3220    def annotation_snpsift(self, threads: int = None) -> None:
 3221        """
 3222        This function annotate with bcftools
 3223
 3224        :param threads: Number of threads to use
 3225        :return: the value of the variable "return_value".
 3226        """
 3227
 3228        # DEBUG
 3229        log.debug("Start annotation with bcftools databases")
 3230
 3231        # Threads
 3232        if not threads:
 3233            threads = self.get_threads()
 3234        log.debug("Threads: " + str(threads))
 3235
 3236        # Config
 3237        config = self.get_config()
 3238        log.debug("Config: " + str(config))
 3239
 3240        # Config - snpSift
 3241        snpsift_bin_command = get_bin_command(
 3242            bin="SnpSift.jar",
 3243            tool="snpsift",
 3244            bin_type="jar",
 3245            config=config,
 3246            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3247        )
 3248        if not snpsift_bin_command:
 3249            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3250            log.error(msg_err)
 3251            raise ValueError(msg_err)
 3252
 3253        # Config - bcftools
 3254        bcftools_bin_command = get_bin_command(
 3255            bin="bcftools",
 3256            tool="bcftools",
 3257            bin_type="bin",
 3258            config=config,
 3259            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3260        )
 3261        if not bcftools_bin_command:
 3262            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3263            log.error(msg_err)
 3264            raise ValueError(msg_err)
 3265
 3266        # Config - BCFTools databases folders
 3267        databases_folders = set(
 3268            self.get_config()
 3269            .get("folders", {})
 3270            .get("databases", {})
 3271            .get("annotations", ["."])
 3272            + self.get_config()
 3273            .get("folders", {})
 3274            .get("databases", {})
 3275            .get("bcftools", ["."])
 3276        )
 3277        log.debug("Databases annotations: " + str(databases_folders))
 3278
 3279        # Param
 3280        annotations = (
 3281            self.get_param()
 3282            .get("annotation", {})
 3283            .get("snpsift", {})
 3284            .get("annotations", None)
 3285        )
 3286        log.debug("Annotations: " + str(annotations))
 3287
 3288        # Assembly
 3289        assembly = self.get_param().get(
 3290            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3291        )
 3292
 3293        # Data
 3294        table_variants = self.get_table_variants()
 3295
 3296        # Check if not empty
 3297        log.debug("Check if not empty")
 3298        sql_query_chromosomes = (
 3299            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3300        )
 3301        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3302        if not sql_query_chromosomes_df["count"][0]:
 3303            log.info(f"VCF empty")
 3304            return
 3305
 3306        # VCF header
 3307        vcf_reader = self.get_header()
 3308        log.debug("Initial header: " + str(vcf_reader.infos))
 3309
 3310        # Existing annotations
 3311        for vcf_annotation in self.get_header().infos:
 3312
 3313            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3314            log.debug(
 3315                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3316            )
 3317
 3318        if annotations:
 3319
 3320            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3321
 3322                # Export VCF file
 3323                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3324
 3325                # Init
 3326                commands = {}
 3327
 3328                for annotation in annotations:
 3329                    annotation_fields = annotations[annotation]
 3330
 3331                    # Annotation Name
 3332                    annotation_name = os.path.basename(annotation)
 3333
 3334                    if not annotation_fields:
 3335                        annotation_fields = {"INFO": None}
 3336
 3337                    log.debug(f"Annotation '{annotation_name}'")
 3338                    log.debug(
 3339                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3340                    )
 3341
 3342                    # Create Database
 3343                    database = Database(
 3344                        database=annotation,
 3345                        databases_folders=databases_folders,
 3346                        assembly=assembly,
 3347                    )
 3348
 3349                    # Find files
 3350                    db_file = database.get_database()
 3351                    db_file = full_path(db_file)
 3352                    db_hdr_file = database.get_header_file()
 3353                    db_hdr_file = full_path(db_hdr_file)
 3354                    db_file_type = database.get_format()
 3355                    db_tbi_file = f"{db_file}.tbi"
 3356                    db_file_compressed = database.is_compressed()
 3357
 3358                    # Check if compressed
 3359                    if not db_file_compressed:
 3360                        log.error(
 3361                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3362                        )
 3363                        raise ValueError(
 3364                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3365                        )
 3366
 3367                    # Check if indexed
 3368                    if not os.path.exists(db_tbi_file):
 3369                        log.error(
 3370                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3371                        )
 3372                        raise ValueError(
 3373                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3374                        )
 3375
 3376                    # Check index - try to create if not exists
 3377                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3378                        log.error("Annotation failed: database not valid")
 3379                        log.error(f"Annotation annotation file: {db_file}")
 3380                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3381                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3382                        raise ValueError(
 3383                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3384                        )
 3385                    else:
 3386
 3387                        log.debug(
 3388                            f"Annotation '{annotation}' - file: "
 3389                            + str(db_file)
 3390                            + " and "
 3391                            + str(db_hdr_file)
 3392                        )
 3393
 3394                        # Load header as VCF object
 3395                        db_hdr_vcf = Variants(input=db_hdr_file)
 3396                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3397                        log.debug(
 3398                            "Annotation database header: "
 3399                            + str(db_hdr_vcf_header_infos)
 3400                        )
 3401
 3402                        # For all fields in database
 3403                        annotation_fields_full = False
 3404                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3405                            annotation_fields = {
 3406                                key: key for key in db_hdr_vcf_header_infos
 3407                            }
 3408                            log.debug(
 3409                                "Annotation database header - All annotations added: "
 3410                                + str(annotation_fields)
 3411                            )
 3412                            annotation_fields_full = True
 3413
 3414                        # # Create file for field rename
 3415                        # log.debug("Create file for field rename")
 3416                        # tmp_rename = NamedTemporaryFile(
 3417                        #     prefix=self.get_prefix(),
 3418                        #     dir=self.get_tmp_dir(),
 3419                        #     suffix=".rename",
 3420                        #     delete=False,
 3421                        # )
 3422                        # tmp_rename_name = tmp_rename.name
 3423                        # tmp_files.append(tmp_rename_name)
 3424
 3425                        # Number of fields
 3426                        nb_annotation_field = 0
 3427                        annotation_list = []
 3428                        annotation_infos_rename_list = []
 3429
 3430                        for annotation_field in annotation_fields:
 3431
 3432                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3433                            annotation_fields_new_name = annotation_fields.get(
 3434                                annotation_field, annotation_field
 3435                            )
 3436                            if not annotation_fields_new_name:
 3437                                annotation_fields_new_name = annotation_field
 3438
 3439                            # Check if field is in DB and if field is not elready in input data
 3440                            if (
 3441                                annotation_field in db_hdr_vcf.get_header().infos
 3442                                and annotation_fields_new_name
 3443                                not in self.get_header().infos
 3444                            ):
 3445
 3446                                log.info(
 3447                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3448                                )
 3449
 3450                                # BCFTools annotate param to rename fields
 3451                                if annotation_field != annotation_fields_new_name:
 3452                                    annotation_infos_rename_list.append(
 3453                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3454                                    )
 3455
 3456                                # Add INFO field to header
 3457                                db_hdr_vcf_header_infos_number = (
 3458                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3459                                )
 3460                                db_hdr_vcf_header_infos_type = (
 3461                                    db_hdr_vcf_header_infos[annotation_field].type
 3462                                    or "String"
 3463                                )
 3464                                db_hdr_vcf_header_infos_description = (
 3465                                    db_hdr_vcf_header_infos[annotation_field].desc
 3466                                    or f"{annotation_field} description"
 3467                                )
 3468                                db_hdr_vcf_header_infos_source = (
 3469                                    db_hdr_vcf_header_infos[annotation_field].source
 3470                                    or "unknown"
 3471                                )
 3472                                db_hdr_vcf_header_infos_version = (
 3473                                    db_hdr_vcf_header_infos[annotation_field].version
 3474                                    or "unknown"
 3475                                )
 3476
 3477                                vcf_reader.infos[annotation_fields_new_name] = (
 3478                                    vcf.parser._Info(
 3479                                        annotation_fields_new_name,
 3480                                        db_hdr_vcf_header_infos_number,
 3481                                        db_hdr_vcf_header_infos_type,
 3482                                        db_hdr_vcf_header_infos_description,
 3483                                        db_hdr_vcf_header_infos_source,
 3484                                        db_hdr_vcf_header_infos_version,
 3485                                        self.code_type_map[
 3486                                            db_hdr_vcf_header_infos_type
 3487                                        ],
 3488                                    )
 3489                                )
 3490
 3491                                annotation_list.append(annotation_field)
 3492
 3493                                nb_annotation_field += 1
 3494
 3495                            else:
 3496
 3497                                if (
 3498                                    annotation_field
 3499                                    not in db_hdr_vcf.get_header().infos
 3500                                ):
 3501                                    log.warning(
 3502                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3503                                    )
 3504                                if (
 3505                                    annotation_fields_new_name
 3506                                    in self.get_header().infos
 3507                                ):
 3508                                    log.warning(
 3509                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3510                                    )
 3511
 3512                        log.info(
 3513                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3514                        )
 3515
 3516                        annotation_infos = ",".join(annotation_list)
 3517
 3518                        if annotation_infos != "":
 3519
 3520                            # Annotated VCF (and error file)
 3521                            tmp_annotation_vcf_name = os.path.join(
 3522                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3523                            )
 3524                            tmp_annotation_vcf_name_err = (
 3525                                tmp_annotation_vcf_name + ".err"
 3526                            )
 3527
 3528                            # Add fields to annotate
 3529                            if not annotation_fields_full:
 3530                                annotation_infos_option = f"-info {annotation_infos}"
 3531                            else:
 3532                                annotation_infos_option = ""
 3533
 3534                            # Info fields rename
 3535                            if annotation_infos_rename_list:
 3536                                annotation_infos_rename = " -c " + ",".join(
 3537                                    annotation_infos_rename_list
 3538                                )
 3539                            else:
 3540                                annotation_infos_rename = ""
 3541
 3542                            # Annotate command
 3543                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3544
 3545                            # Add command
 3546                            commands[command_annotate] = tmp_annotation_vcf_name
 3547
 3548                if commands:
 3549
 3550                    # Export VCF file
 3551                    self.export_variant_vcf(
 3552                        vcf_file=tmp_vcf_name,
 3553                        remove_info=True,
 3554                        add_samples=False,
 3555                        index=True,
 3556                    )
 3557                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3558
 3559                    # Num command
 3560                    nb_command = 0
 3561
 3562                    # Annotate
 3563                    for command_annotate in commands:
 3564                        nb_command += 1
 3565                        log.info(
 3566                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3567                        )
 3568                        log.debug(f"command_annotate={command_annotate}")
 3569                        run_parallel_commands([command_annotate], threads)
 3570
 3571                        # Debug
 3572                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3573
 3574                        # Update variants
 3575                        log.info(
 3576                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3577                        )
 3578                        self.update_from_vcf(commands[command_annotate])
 3579
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with external databases using bcftools.

        For each database configured in param["annotation"]["bcftools"]["annotations"],
        this method:
        - resolves the database file, its header file and checks for a ``.tbi`` index,
        - registers the requested INFO fields in the in-memory VCF header (skipping
          fields absent from the database header or already present in the input),
        - exports the current variants to a temporary bgzipped VCF,
        - builds one ``bcftools annotate`` shell command per (database, chromosome)
          pair, restricted via a temporary BED of merged regions around the variants,
        - runs all commands in parallel, merges the per-command outputs with
          ``bcftools merge`` and loads the merged INFO fields back into the table.

        :param threads: Number of threads to use; defaults to ``self.get_threads()``
        :raises ValueError: if no bcftools binary is found, if a database file is
            not compressed, not indexed, or missing its data/header file, or if any
            annotation command wrote error lines ("[E::...") to its stderr file
        :return: None (returns early when the variants table is empty or when no
            bcftools annotations are configured)
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads: fall back to the object's configured thread count
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Keep temporary files/folders around for inspection when verbosity is "debug"
        # NOTE(review): delete_tmp is computed but never read later in this method —
        # confirm whether temp-file cleanup was meant to depend on it
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (resolved from config or default tools folder)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific ones
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param - mapping of database -> {source_field: renamed_field}
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param overrides config, which overrides the default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Reserve the temporary VCF path the variants will be exported to
        # (the actual export happens later, only if there are commands to run)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated in place below to register the new INFO fields)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Per-run accumulators: annotated VCF chunks, shell commands,
            # temp files to remove after the merge, stderr files to scan
            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields means "annotate with everything" (see INFO/ALL below)
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database object to locate the files for this annotation
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files: data file, header file, format, tabix index
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate needs a bgzipped database)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (a .tbi index must sit next to the database file)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check index - try to create if not exists
                # NOTE(review): despite the comment above, no index is created here;
                # a missing data/header file simply raises
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load the database header file as a Variants object to read its INFO fields
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # "ALL"/"INFO" pseudo-fields expand to every field declared in the database header
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields actually scheduled, and the bcftools -c column list
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Target (renamed) INFO field name; falls back to the
                        # source field name when no rename is given
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Check if field is in DB and if field is not already in input data
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, defaulting any metadata
                            # the database header left unset
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools -c rename syntax: "NEW:=INFO/OLD"; plain
                            # field name when no rename is needed
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # Field skipped: explain which of the two conditions failed
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools: keep only "##" meta lines,
                        # dropping any "#CHROM" column line or variant records
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command (zcat for gzipped header files, cat otherwise)
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases carry no INFO column mapping of their own:
                        # prepend the positional CHROM/POS/POS columns for bcftools -c
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One bcftools command per chromosome, each restricted to
                        # a BED of merged windows around that chromosome's variants
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: +/- 1Mb window per variant, clamped
                            # at 0, then merged into non-overlapping intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT  \"#CHROM\",
                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                        \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files: per-chromosome annotated VCF output and
                            # its stderr capture file
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate within the BED regions, write bgzipped
                            # output, tabix-index it; stderr appended to the .err file
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export the variants (without INFO/samples) to the VCF every
                # annotate command reads as input
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # Split the thread budget evenly across the annotate commands,
                # which run in parallel below
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # round() may yield 0 when there are more commands than threads
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands (only worthwhile above 1)
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge all per-chromosome annotated VCFs back together
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file for the merged output (delete=True: removed on close)
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command, appended to the merge so cleanup
                    # only happens after a successful merge
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge (--force-samples tolerates duplicate sample names
                    # across the merged files)
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: scan every captured stderr file; htslib
                    # prefixes warnings with "[W::" and errors with "[E::"
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info: deduplicated warnings and errors at info level
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info: every deduplicated stderr line
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed: any "[E::" line aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants table from the merged annotated VCF
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)
 4060
 4061    def annotation_exomiser(self, threads: int = None) -> None:
 4062        """
 4063        This function annotate with Exomiser
 4064
 4065        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4066        - "analysis" (dict/file):
 4067            Full analysis dictionnary parameters (see Exomiser docs).
 4068            Either a dict, or a file in JSON or YAML format.
 4069            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4070            Default : None
 4071        - "preset" (string):
 4072            Analysis preset (available in config folder).
 4073            Used if no full "analysis" is provided.
 4074            Default: "exome"
 4075        - "phenopacket" (dict/file):
 4076            Samples and phenotipic features parameters (see Exomiser docs).
 4077            Either a dict, or a file in JSON or YAML format.
 4078            Default: None
 4079        - "subject" (dict):
 4080            Sample parameters (see Exomiser docs).
 4081            Example:
 4082                "subject":
 4083                    {
 4084                        "id": "ISDBM322017",
 4085                        "sex": "FEMALE"
 4086                    }
 4087            Default: None
 4088        - "sample" (string):
 4089            Sample name to construct "subject" section:
 4090                "subject":
 4091                    {
 4092                        "id": "<sample>",
 4093                        "sex": "UNKNOWN_SEX"
 4094                    }
 4095            Default: None
 4096        - "phenotypicFeatures" (dict)
 4097            Phenotypic features to construct "subject" section.
 4098            Example:
 4099                "phenotypicFeatures":
 4100                    [
 4101                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4102                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4103                    ]
 4104        - "hpo" (list)
 4105            List of HPO ids as phenotypic features.
 4106            Example:
 4107                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4108            Default: []
 4109        - "outputOptions" (dict):
 4110            Output options (see Exomiser docs).
 4111            Default:
 4112                "output_options" =
 4113                    {
 4114                        "outputContributingVariantsOnly": False,
 4115                        "numGenes": 0,
 4116                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4117                    }
 4118        - "transcript_source" (string):
 4119            Transcript source (either "refseq", "ucsc", "ensembl")
 4120            Default: "refseq"
 4121        - "exomiser_to_info" (boolean):
 4122            Add exomiser TSV file columns as INFO fields in VCF.
 4123            Default: False
 4124        - "release" (string):
 4125            Exomise database release.
 4126            If not exists, database release will be downloaded (take a while).
 4127            Default: None (provided by application.properties configuration file)
 4128        - "exomiser_application_properties" (file):
 4129            Exomiser configuration file (see Exomiser docs).
 4130            Useful to automatically download databases (especially for specific genome databases).
 4131
 4132        Notes:
 4133        - If no sample in parameters, first sample in VCF will be chosen
 4134        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4135
 4136        :param threads: The number of threads to use
 4137        :return: None.
 4138        """
 4139
 4140        # DEBUG
 4141        log.debug("Start annotation with Exomiser databases")
 4142
 4143        # Threads
 4144        if not threads:
 4145            threads = self.get_threads()
 4146        log.debug("Threads: " + str(threads))
 4147
 4148        # Config
 4149        config = self.get_config()
 4150        log.debug("Config: " + str(config))
 4151
 4152        # Config - Folders - Databases
 4153        databases_folders = (
 4154            config.get("folders", {})
 4155            .get("databases", {})
 4156            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4157        )
 4158        databases_folders = full_path(databases_folders)
 4159        if not os.path.exists(databases_folders):
 4160            log.error(f"Databases annotations: {databases_folders} NOT found")
 4161        log.debug("Databases annotations: " + str(databases_folders))
 4162
 4163        # Config - Exomiser
 4164        exomiser_bin_command = get_bin_command(
 4165            bin="exomiser-cli*.jar",
 4166            tool="exomiser",
 4167            bin_type="jar",
 4168            config=config,
 4169            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4170        )
 4171        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4172        if not exomiser_bin_command:
 4173            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4174            log.error(msg_err)
 4175            raise ValueError(msg_err)
 4176
 4177        # Param
 4178        param = self.get_param()
 4179        log.debug("Param: " + str(param))
 4180
 4181        # Param - Exomiser
 4182        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4183        log.debug(f"Param Exomiser: {param_exomiser}")
 4184
 4185        # Param - Assembly
 4186        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4187        log.debug("Assembly: " + str(assembly))
 4188
 4189        # Data
 4190        table_variants = self.get_table_variants()
 4191
 4192        # Check if not empty
 4193        log.debug("Check if not empty")
 4194        sql_query_chromosomes = (
 4195            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4196        )
 4197        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4198            log.info(f"VCF empty")
 4199            return False
 4200
 4201        # VCF header
 4202        vcf_reader = self.get_header()
 4203        log.debug("Initial header: " + str(vcf_reader.infos))
 4204
 4205        # Samples
 4206        samples = self.get_header_sample_list()
 4207        if not samples:
 4208            log.error("No Samples in VCF")
 4209            return False
 4210        log.debug(f"Samples: {samples}")
 4211
 4212        # Memory limit
 4213        memory_limit = self.get_memory("8G")
 4214        log.debug(f"memory_limit: {memory_limit}")
 4215
 4216        # Exomiser java options
 4217        exomiser_java_options = (
 4218            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4219        )
 4220        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4221
 4222        # Download Exomiser (if not exists)
 4223        exomiser_release = param_exomiser.get("release", None)
 4224        exomiser_application_properties = param_exomiser.get(
 4225            "exomiser_application_properties", None
 4226        )
 4227        databases_download_exomiser(
 4228            assemblies=[assembly],
 4229            exomiser_folder=databases_folders,
 4230            exomiser_release=exomiser_release,
 4231            exomiser_phenotype_release=exomiser_release,
 4232            exomiser_application_properties=exomiser_application_properties,
 4233        )
 4234
 4235        # Force annotation
 4236        force_update_annotation = True
 4237
 4238        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4239            log.debug("Start annotation Exomiser")
 4240
 4241            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4242
 4243                # tmp_dir = "/tmp/exomiser"
 4244
 4245                ### ANALYSIS ###
 4246                ################
 4247
 4248                # Create analysis.json through analysis dict
 4249                # either analysis in param or by default
 4250                # depending on preset (exome/genome)
 4251
 4252                # Init analysis dict
 4253                param_exomiser_analysis_dict = {}
 4254
 4255                # analysis from param
 4256                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4257                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4258
 4259                # If analysis in param -> load analysis json
 4260                if param_exomiser_analysis:
 4261
 4262                    # If param analysis is a file and exists
 4263                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4264                        param_exomiser_analysis
 4265                    ):
 4266                        # Load analysis file into analysis dict (either yaml or json)
 4267                        with open(param_exomiser_analysis) as json_file:
 4268                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4269
 4270                    # If param analysis is a dict
 4271                    elif isinstance(param_exomiser_analysis, dict):
 4272                        # Load analysis dict into analysis dict (either yaml or json)
 4273                        param_exomiser_analysis_dict = param_exomiser_analysis
 4274
 4275                    # Error analysis type
 4276                    else:
 4277                        log.error(f"Analysis type unknown. Check param file.")
 4278                        raise ValueError(f"Analysis type unknown. Check param file.")
 4279
 4280                # Case no input analysis config file/dict
 4281                # Use preset (exome/genome) to open default config file
 4282                if not param_exomiser_analysis_dict:
 4283
 4284                    # default preset
 4285                    default_preset = "exome"
 4286
 4287                    # Get param preset or default preset
 4288                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4289
 4290                    # Try to find if preset is a file
 4291                    if os.path.exists(param_exomiser_preset):
 4292                        # Preset file is provided in full path
 4293                        param_exomiser_analysis_default_config_file = (
 4294                            param_exomiser_preset
 4295                        )
 4296                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4297                    #     # Preset file is provided in full path
 4298                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4299                    elif os.path.exists(
 4300                        os.path.join(folder_config, param_exomiser_preset)
 4301                    ):
 4302                        # Preset file is provided as a basename in config folder (can be a path with subfolders)
 4303                        param_exomiser_analysis_default_config_file = os.path.join(
 4304                            folder_config, param_exomiser_preset
 4305                        )
 4306                    else:
 4307                        # Construct preset file
 4308                        param_exomiser_analysis_default_config_file = os.path.join(
 4309                            folder_config,
 4310                            f"preset-{param_exomiser_preset}-analysis.json",
 4311                        )
 4312
 4313                    # If preset file exists
 4314                    param_exomiser_analysis_default_config_file = full_path(
 4315                        param_exomiser_analysis_default_config_file
 4316                    )
 4317                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4318                        # Load preset file into analysis dict (either yaml or json)
 4319                        with open(
 4320                            param_exomiser_analysis_default_config_file
 4321                        ) as json_file:
 4322                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4323                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4324                                json_file
 4325                            )
 4326
 4327                    # Error preset file
 4328                    else:
 4329                        log.error(
 4330                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4331                        )
 4332                        raise ValueError(
 4333                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4334                        )
 4335
 4336                # If no analysis dict created
 4337                if not param_exomiser_analysis_dict:
 4338                    log.error(f"No analysis config")
 4339                    raise ValueError(f"No analysis config")
 4340
 4341                # Log
 4342                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4343
 4344                ### PHENOPACKET ###
 4345                ###################
 4346
 4347                # If no PhenoPacket in analysis dict -> check in param
 4348                if "phenopacket" not in param_exomiser_analysis_dict:
 4349
 4350                    # If PhenoPacket in param -> load analysis json
 4351                    if param_exomiser.get("phenopacket", None):
 4352
 4353                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4354                        param_exomiser_phenopacket = full_path(
 4355                            param_exomiser_phenopacket
 4356                        )
 4357
 4358                        # If param phenopacket is a file and exists
 4359                        if isinstance(
 4360                            param_exomiser_phenopacket, str
 4361                        ) and os.path.exists(param_exomiser_phenopacket):
 4362                            # Load phenopacket file into analysis dict (either yaml or json)
 4363                            with open(param_exomiser_phenopacket) as json_file:
 4364                                param_exomiser_analysis_dict["phenopacket"] = (
 4365                                    yaml.safe_load(json_file)
 4366                                )
 4367
 4368                        # If param phenopacket is a dict
 4369                        elif isinstance(param_exomiser_phenopacket, dict):
 4370                            # Load phenopacket dict into analysis dict (either yaml or json)
 4371                            param_exomiser_analysis_dict["phenopacket"] = (
 4372                                param_exomiser_phenopacket
 4373                            )
 4374
 4375                        # Error phenopacket type
 4376                        else:
 4377                            log.error(f"Phenopacket type unknown. Check param file.")
 4378                            raise ValueError(
 4379                                f"Phenopacket type unknown. Check param file."
 4380                            )
 4381
 4382                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4383                if "phenopacket" not in param_exomiser_analysis_dict:
 4384
 4385                    # Init PhenoPacket
 4386                    param_exomiser_analysis_dict["phenopacket"] = {
 4387                        "id": "analysis",
 4388                        "proband": {},
 4389                    }
 4390
 4391                    ### Add subject ###
 4392
 4393                    # If subject exists
 4394                    param_exomiser_subject = param_exomiser.get("subject", {})
 4395
 4396                    # If subject not exists -> found sample ID
 4397                    if not param_exomiser_subject:
 4398
 4399                        # Found sample ID in param
 4400                        sample = param_exomiser.get("sample", None)
 4401
 4402                        # Find sample ID (first sample)
 4403                        if not sample:
 4404                            sample_list = self.get_header_sample_list()
 4405                            if len(sample_list) > 0:
 4406                                sample = sample_list[0]
 4407                            else:
 4408                                log.error(f"No sample found")
 4409                                raise ValueError(f"No sample found")
 4410
 4411                        # Create subject
 4412                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4413
 4414                    # Add to dict
 4415                    param_exomiser_analysis_dict["phenopacket"][
 4416                        "subject"
 4417                    ] = param_exomiser_subject
 4418
 4419                    ### Add "phenotypicFeatures" ###
 4420
 4421                    # If phenotypicFeatures exists
 4422                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4423                        "phenotypicFeatures", []
 4424                    )
 4425
 4426                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4427                    if not param_exomiser_phenotypicfeatures:
 4428
 4429                        # Found HPO in param
 4430                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4431
 4432                        # Split HPO if list in string format separated by comma
 4433                        if isinstance(param_exomiser_hpo, str):
 4434                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4435
 4436                        # Create HPO list
 4437                        for hpo in param_exomiser_hpo:
 4438                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4439                            param_exomiser_phenotypicfeatures.append(
 4440                                {
 4441                                    "type": {
 4442                                        "id": f"HP:{hpo_clean}",
 4443                                        "label": f"HP:{hpo_clean}",
 4444                                    }
 4445                                }
 4446                            )
 4447
 4448                    # Add to dict
 4449                    param_exomiser_analysis_dict["phenopacket"][
 4450                        "phenotypicFeatures"
 4451                    ] = param_exomiser_phenotypicfeatures
 4452
 4453                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4454                    if not param_exomiser_phenotypicfeatures:
 4455                        for step in param_exomiser_analysis_dict.get(
 4456                            "analysis", {}
 4457                        ).get("steps", []):
 4458                            if "hiPhivePrioritiser" in step:
 4459                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4460                                    "steps", []
 4461                                ).remove(step)
 4462
 4463                ### Add Input File ###
 4464
 4465                # Initial file name and htsFiles
 4466                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4467                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4468                    {
 4469                        "uri": tmp_vcf_name,
 4470                        "htsFormat": "VCF",
 4471                        "genomeAssembly": assembly,
 4472                    }
 4473                ]
 4474
 4475                ### Add metaData ###
 4476
 4477                # If metaData not in analysis dict
 4478                if "metaData" not in param_exomiser_analysis_dict:
 4479                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4480                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4481                        "createdBy": "howard",
 4482                        "phenopacketSchemaVersion": 1,
 4483                    }
 4484
 4485                ### OutputOptions ###
 4486
 4487                # Init output result folder
 4488                output_results = os.path.join(tmp_dir, "results")
 4489
 4490                # If no outputOptions in analysis dict
 4491                if "outputOptions" not in param_exomiser_analysis_dict:
 4492
 4493                    # default output formats
 4494                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4495
 4496                    # Get outputOptions in param
 4497                    output_options = param_exomiser.get("outputOptions", None)
 4498
 4499                    # If no output_options in param -> check
 4500                    if not output_options:
 4501                        output_options = {
 4502                            "outputContributingVariantsOnly": False,
 4503                            "numGenes": 0,
 4504                            "outputFormats": defaut_output_formats,
 4505                        }
 4506
 4507                    # Replace outputDirectory in output options
 4508                    output_options["outputDirectory"] = output_results
 4509                    output_options["outputFileName"] = "howard"
 4510
 4511                    # Add outputOptions in analysis dict
 4512                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4513
 4514                else:
 4515
 4516                    # Replace output_results and output format (if exists in param)
 4517                    param_exomiser_analysis_dict["outputOptions"][
 4518                        "outputDirectory"
 4519                    ] = output_results
 4520                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4521                        list(
 4522                            set(
 4523                                param_exomiser_analysis_dict.get(
 4524                                    "outputOptions", {}
 4525                                ).get("outputFormats", [])
 4526                                + ["TSV_VARIANT", "VCF"]
 4527                            )
 4528                        )
 4529                    )
 4530
 4531                # log
 4532                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4533
 4534                ### ANALYSIS FILE ###
 4535                #####################
 4536
 4537                ### Full JSON analysis config file ###
 4538
 4539                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4540                with open(exomiser_analysis, "w") as fp:
 4541                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4542
 4543                ### SPLIT analysis and sample config files
 4544
 4545                # Splitted analysis dict
 4546                param_exomiser_analysis_dict_for_split = (
 4547                    param_exomiser_analysis_dict.copy()
 4548                )
 4549
 4550                # Phenopacket JSON file
 4551                exomiser_analysis_phenopacket = os.path.join(
 4552                    tmp_dir, "analysis_phenopacket.json"
 4553                )
 4554                with open(exomiser_analysis_phenopacket, "w") as fp:
 4555                    json.dump(
 4556                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4557                        fp,
 4558                        indent=4,
 4559                    )
 4560
 4561                # Analysis JSON file without Phenopacket parameters
 4562                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4563                exomiser_analysis_analysis = os.path.join(
 4564                    tmp_dir, "analysis_analysis.json"
 4565                )
 4566                with open(exomiser_analysis_analysis, "w") as fp:
 4567                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4568
 4569                ### INITIAL VCF file ###
 4570                ########################
 4571
 4572                ### Create list of samples to use and include into initial VCF file ###
 4573
 4574                # Subject (main sample)
 4575                # Get sample ID in analysis dict
 4576                sample_subject = (
 4577                    param_exomiser_analysis_dict.get("phenopacket", {})
 4578                    .get("subject", {})
 4579                    .get("id", None)
 4580                )
 4581                sample_proband = (
 4582                    param_exomiser_analysis_dict.get("phenopacket", {})
 4583                    .get("proband", {})
 4584                    .get("subject", {})
 4585                    .get("id", None)
 4586                )
 4587                sample = []
 4588                if sample_subject:
 4589                    sample.append(sample_subject)
 4590                if sample_proband:
 4591                    sample.append(sample_proband)
 4592
 4593                # Get sample ID within Pedigree
 4594                pedigree_persons_list = (
 4595                    param_exomiser_analysis_dict.get("phenopacket", {})
 4596                    .get("pedigree", {})
 4597                    .get("persons", {})
 4598                )
 4599
 4600                # Create list with all sample ID in pedigree (if exists)
 4601                pedigree_persons = []
 4602                for person in pedigree_persons_list:
 4603                    pedigree_persons.append(person.get("individualId"))
 4604
 4605                # Concat subject sample ID and sample IDs in pedigree samples
 4606                samples = list(set(sample + pedigree_persons))
 4607
 4608                # Check if sample list is not empty
 4609                if not samples:
 4610                    log.error(f"No samples found")
 4611                    raise ValueError(f"No samples found")
 4612
 4613                # Create VCF with sample (either sample in param or first one by default)
 4614                # Export VCF file
 4615                self.export_variant_vcf(
 4616                    vcf_file=tmp_vcf_name,
 4617                    remove_info=True,
 4618                    add_samples=True,
 4619                    list_samples=samples,
 4620                    index=False,
 4621                )
 4622
 4623                ### Execute Exomiser ###
 4624                ########################
 4625
 4626                # Init command
 4627                exomiser_command = ""
 4628
 4629                # Command exomiser options
 4630                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4631
 4632                # Release
 4633                exomiser_release = param_exomiser.get("release", None)
 4634                if exomiser_release:
 4635                    # phenotype data version
 4636                    exomiser_options += (
 4637                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4638                    )
 4639                    # data version
 4640                    exomiser_options += (
 4641                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4642                    )
 4643                    # variant white list
 4644                    variant_white_list_file = (
 4645                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4646                    )
 4647                    if os.path.exists(
 4648                        os.path.join(
 4649                            databases_folders, assembly, variant_white_list_file
 4650                        )
 4651                    ):
 4652                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4653
 4654                # transcript_source
 4655                transcript_source = param_exomiser.get(
 4656                    "transcript_source", None
 4657                )  # ucsc, refseq, ensembl
 4658                if transcript_source:
 4659                    exomiser_options += (
 4660                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4661                    )
 4662
 4663                # If analysis contain proband param
 4664                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4665                    "proband", {}
 4666                ):
 4667                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4668
 4669                # If no proband (usually uniq sample)
 4670                else:
 4671                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 4672
 4673                # Log
 4674                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 4675
 4676                # Run command
 4677                result = subprocess.call(
 4678                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 4679                )
 4680                if result:
 4681                    log.error("Exomiser command failed")
 4682                    raise ValueError("Exomiser command failed")
 4683
 4684                ### RESULTS ###
 4685                ###############
 4686
 4687                ### Annotate with TSV fields ###
 4688
 4689                # Init result tsv file
 4690                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 4691
 4692                # Init result tsv file
 4693                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 4694
 4695                # Parse TSV file and explode columns in INFO field
 4696                if exomiser_to_info and os.path.exists(output_results_tsv):
 4697
 4698                    # Log
 4699                    log.debug("Exomiser columns to VCF INFO field")
 4700
 4701                    # Retrieve columns and types
 4702                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 4703                    output_results_tsv_df = self.get_query_to_df(query)
 4704                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 4705
 4706                    # Init concat fields for update
 4707                    sql_query_update_concat_fields = []
 4708
 4709                    # Fields to avoid
 4710                    fields_to_avoid = [
 4711                        "CONTIG",
 4712                        "START",
 4713                        "END",
 4714                        "REF",
 4715                        "ALT",
 4716                        "QUAL",
 4717                        "FILTER",
 4718                        "GENOTYPE",
 4719                    ]
 4720
 4721                    # List all columns to add into header
 4722                    for header_column in output_results_tsv_columns:
 4723
 4724                        # If header column is enable
 4725                        if header_column not in fields_to_avoid:
 4726
 4727                            # Header info type
 4728                            header_info_type = "String"
 4729                            header_column_df = output_results_tsv_df[header_column]
 4730                            header_column_df_dtype = header_column_df.dtype
 4731                            if header_column_df_dtype == object:
 4732                                if (
 4733                                    pd.to_numeric(header_column_df, errors="coerce")
 4734                                    .notnull()
 4735                                    .all()
 4736                                ):
 4737                                    header_info_type = "Float"
 4738                            else:
 4739                                header_info_type = "Integer"
 4740
 4741                            # Header info
 4742                            characters_to_validate = ["-"]
 4743                            pattern = "[" + "".join(characters_to_validate) + "]"
 4744                            header_info_name = re.sub(
 4745                                pattern,
 4746                                "_",
 4747                                f"Exomiser_{header_column}".replace("#", ""),
 4748                            )
 4749                            header_info_number = "."
 4750                            header_info_description = (
 4751                                f"Exomiser {header_column} annotation"
 4752                            )
 4753                            header_info_source = "Exomiser"
 4754                            header_info_version = "unknown"
 4755                            header_info_code = CODE_TYPE_MAP[header_info_type]
 4756                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 4757                                header_info_name,
 4758                                header_info_number,
 4759                                header_info_type,
 4760                                header_info_description,
 4761                                header_info_source,
 4762                                header_info_version,
 4763                                header_info_code,
 4764                            )
 4765
 4766                            # Add field to add for update to concat fields
 4767                            sql_query_update_concat_fields.append(
 4768                                f"""
 4769                                CASE
 4770                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 4771                                    THEN concat(
 4772                                        '{header_info_name}=',
 4773                                        table_parquet."{header_column}",
 4774                                        ';'
 4775                                        )
 4776
 4777                                    ELSE ''
 4778                                END
 4779                            """
 4780                            )
 4781
 4782                    # Update query
 4783                    sql_query_update = f"""
 4784                        UPDATE {table_variants} as table_variants
 4785                            SET INFO = concat(
 4786                                            CASE
 4787                                                WHEN INFO NOT IN ('', '.')
 4788                                                THEN INFO
 4789                                                ELSE ''
 4790                                            END,
 4791                                            CASE
 4792                                                WHEN table_variants.INFO NOT IN ('','.')
 4793                                                THEN ';'
 4794                                                ELSE ''
 4795                                            END,
 4796                                            (
 4797                                            SELECT 
 4798                                                concat(
 4799                                                    {",".join(sql_query_update_concat_fields)}
 4800                                                )
 4801                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 4802                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 4803                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 4804                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 4805                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 4806                                            )
 4807                                        )
 4808                            ;
 4809                        """
 4810
 4811                    # Update
 4812                    self.conn.execute(sql_query_update)
 4813
 4814                ### Annotate with VCF INFO field ###
 4815
 4816                # Init result VCF file
 4817                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 4818
 4819                # If VCF exists
 4820                if os.path.exists(output_results_vcf):
 4821
 4822                    # Log
 4823                    log.debug("Exomiser result VCF update variants")
 4824
 4825                    # Find Exomiser INFO field annotation in header
 4826                    with gzip.open(output_results_vcf, "rt") as f:
 4827                        header_list = self.read_vcf_header(f)
 4828                    exomiser_vcf_header = vcf.Reader(
 4829                        io.StringIO("\n".join(header_list))
 4830                    )
 4831
 4832                    # Add annotation INFO field to header
 4833                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 4834
 4835                    # Update variants with VCF
 4836                    self.update_from_vcf(output_results_vcf)
 4837
 4838        return True
 4839
 4840    def annotation_snpeff(self, threads: int = None) -> None:
 4841        """
 4842        This function annotate with snpEff
 4843
 4844        :param threads: The number of threads to use
 4845        :return: the value of the variable "return_value".
 4846        """
 4847
 4848        # DEBUG
 4849        log.debug("Start annotation with snpeff databases")
 4850
 4851        # Threads
 4852        if not threads:
 4853            threads = self.get_threads()
 4854        log.debug("Threads: " + str(threads))
 4855
 4856        # DEBUG
 4857        delete_tmp = True
 4858        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4859            delete_tmp = False
 4860            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4861
 4862        # Config
 4863        config = self.get_config()
 4864        log.debug("Config: " + str(config))
 4865
 4866        # Config - Folders - Databases
 4867        databases_folders = (
 4868            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 4869        )
 4870        log.debug("Databases annotations: " + str(databases_folders))
 4871
 4872        # # Config - Java
 4873        # java_bin = get_bin(
 4874        #     tool="java",
 4875        #     bin="java",
 4876        #     bin_type="bin",
 4877        #     config=config,
 4878        #     default_folder="/usr/bin",
 4879        # )
 4880        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
 4881        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
 4882        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
 4883
 4884        # # Config - snpEff bin
 4885        # snpeff_jar = get_bin(
 4886        #     tool="snpeff",
 4887        #     bin="snpEff.jar",
 4888        #     bin_type="jar",
 4889        #     config=config,
 4890        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4891        # )
 4892        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
 4893        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4894        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4895
 4896        # Config - snpEff bin command
 4897        snpeff_bin_command = get_bin_command(
 4898            bin="snpEff.jar",
 4899            tool="snpeff",
 4900            bin_type="jar",
 4901            config=config,
 4902            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4903        )
 4904        if not snpeff_bin_command:
 4905            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 4906            log.error(msg_err)
 4907            raise ValueError(msg_err)
 4908
 4909        # Config - snpEff databases
 4910        snpeff_databases = (
 4911            config.get("folders", {})
 4912            .get("databases", {})
 4913            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 4914        )
 4915        snpeff_databases = full_path(snpeff_databases)
 4916        if snpeff_databases is not None and snpeff_databases != "":
 4917            log.debug(f"Create snpEff databases folder")
 4918            if not os.path.exists(snpeff_databases):
 4919                os.makedirs(snpeff_databases)
 4920
 4921        # Param
 4922        param = self.get_param()
 4923        log.debug("Param: " + str(param))
 4924
 4925        # Param
 4926        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 4927        log.debug("Options: " + str(options))
 4928
 4929        # Param - Assembly
 4930        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4931
 4932        # Param - Options
 4933        snpeff_options = (
 4934            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 4935        )
 4936        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 4937        snpeff_csvstats = (
 4938            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 4939        )
 4940        if snpeff_stats:
 4941            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 4942            snpeff_stats = full_path(snpeff_stats)
 4943            snpeff_options += f" -stats {snpeff_stats}"
 4944        if snpeff_csvstats:
 4945            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 4946            snpeff_csvstats = full_path(snpeff_csvstats)
 4947            snpeff_options += f" -csvStats {snpeff_csvstats}"
 4948
 4949        # Data
 4950        table_variants = self.get_table_variants()
 4951
 4952        # Check if not empty
 4953        log.debug("Check if not empty")
 4954        sql_query_chromosomes = (
 4955            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4956        )
 4957        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 4958        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4959            log.info(f"VCF empty")
 4960            return
 4961
 4962        # Export in VCF
 4963        log.debug("Create initial file to annotate")
 4964        tmp_vcf = NamedTemporaryFile(
 4965            prefix=self.get_prefix(),
 4966            dir=self.get_tmp_dir(),
 4967            suffix=".vcf.gz",
 4968            delete=True,
 4969        )
 4970        tmp_vcf_name = tmp_vcf.name
 4971
 4972        # VCF header
 4973        vcf_reader = self.get_header()
 4974        log.debug("Initial header: " + str(vcf_reader.infos))
 4975
 4976        # Existing annotations
 4977        for vcf_annotation in self.get_header().infos:
 4978
 4979            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4980            log.debug(
 4981                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4982            )
 4983
 4984        # Memory limit
 4985        # if config.get("memory", None):
 4986        #     memory_limit = config.get("memory", "8G")
 4987        # else:
 4988        #     memory_limit = "8G"
 4989        memory_limit = self.get_memory("8G")
 4990        log.debug(f"memory_limit: {memory_limit}")
 4991
 4992        # snpEff java options
 4993        snpeff_java_options = (
 4994            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4995        )
 4996        log.debug(f"Exomiser java options: {snpeff_java_options}")
 4997
 4998        force_update_annotation = True
 4999
 5000        if "ANN" not in self.get_header().infos or force_update_annotation:
 5001
 5002            # Check snpEff database
 5003            log.debug(f"Check snpEff databases {[assembly]}")
 5004            databases_download_snpeff(
 5005                folder=snpeff_databases, assemblies=[assembly], config=config
 5006            )
 5007
 5008            # Export VCF file
 5009            self.export_variant_vcf(
 5010                vcf_file=tmp_vcf_name,
 5011                remove_info=True,
 5012                add_samples=False,
 5013                index=True,
 5014            )
 5015
 5016            # Tmp file
 5017            err_files = []
 5018            tmp_annotate_vcf = NamedTemporaryFile(
 5019                prefix=self.get_prefix(),
 5020                dir=self.get_tmp_dir(),
 5021                suffix=".vcf",
 5022                delete=False,
 5023            )
 5024            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5025            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5026            err_files.append(tmp_annotate_vcf_name_err)
 5027
 5028            # Command
 5029            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5030            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5031            run_parallel_commands([snpeff_command], 1)
 5032
 5033            # Error messages
 5034            log.info(f"Error/Warning messages:")
 5035            error_message_command_all = []
 5036            error_message_command_warning = []
 5037            error_message_command_err = []
 5038            for err_file in err_files:
 5039                with open(err_file, "r") as f:
 5040                    for line in f:
 5041                        message = line.strip()
 5042                        error_message_command_all.append(message)
 5043                        if line.startswith("[W::"):
 5044                            error_message_command_warning.append(message)
 5045                        if line.startswith("[E::"):
 5046                            error_message_command_err.append(f"{err_file}: " + message)
 5047            # log info
 5048            for message in list(
 5049                set(error_message_command_err + error_message_command_warning)
 5050            ):
 5051                log.info(f"   {message}")
 5052            # debug info
 5053            for message in list(set(error_message_command_all)):
 5054                log.debug(f"   {message}")
 5055            # failed
 5056            if len(error_message_command_err):
 5057                log.error("Annotation failed: Error in commands")
 5058                raise ValueError("Annotation failed: Error in commands")
 5059
 5060            # Find annotation in header
 5061            with open(tmp_annotate_vcf_name, "rt") as f:
 5062                header_list = self.read_vcf_header(f)
 5063            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5064
 5065            for ann in annovar_vcf_header.infos:
 5066                if ann not in self.get_header().infos:
 5067                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5068
 5069            # Update variants
 5070            log.info(f"Annotation - Updating...")
 5071            self.update_from_vcf(tmp_annotate_vcf_name)
 5072
 5073        else:
 5074            if "ANN" in self.get_header().infos:
 5075                log.debug(f"Existing snpEff annotations in VCF")
 5076            if force_update_annotation:
 5077                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5078
    def annotation_annovar(self, threads: int = None) -> None:
        """
        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
        annotations.

        Variants are exported to a temporary VCF, each configured Annovar
        database is run separately (producing one annotated, cleaned and
        indexed VCF per database), the per-database results are merged back
        onto the original VCF with bcftools, and the variants table is updated
        from the merged file. Temporary files are cleaned up at the end.

        :param threads: number of threads to use (defaults to the configured
            number of threads)
        :return: None (returns early if the variants table is empty or no
            annotations are configured)
        :raises ValueError: if the annovar/bcftools commands are unavailable,
            or if a command writes error messages to its stderr file
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp and Err files (accumulated for the final cleanup)
        tmp_files = []
        err_files = []

        # DEBUG
        # NOTE(review): delete_tmp is computed but the final cleanup below
        # runs unconditionally ('if True:') — confirm intended
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (table_annovar.pl via perl)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases (folder created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations (database -> {field: renamed_field})
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        # Re-annotate even if the field already exists in the header
        force_update_annotation = True

        if annotations:

            # NOTE(review): 'commands' is assigned but never used
            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (bcftools --rename-annots format)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (download missing files for the assembly)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One Annovar run per annotation database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE(review): err_files is reset on each iteration, so only the
                # last database's stderr (plus the merge stderr appended later) is
                # inspected after the loop — confirm intended
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Fields to annotate (original and renamed names)
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info (one 'old new' line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol (Annovar database name)
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options ('genebase' is consumed above, not forwarded)
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages collected from stderr files
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged VCF
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge (original VCF first, then all annotated files)
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header; merge new INFO fields into ours
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): cleanup always runs ('if True:'), regardless of the
            # delete_tmp flag computed above — confirm intended
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)
 5459
 5460    # Parquet
 5461    def annotation_parquet(self, threads: int = None) -> None:
 5462        """
 5463        It takes a VCF file, and annotates it with a parquet file
 5464
 5465        :param threads: number of threads to use for the annotation
 5466        :return: the value of the variable "result".
 5467        """
 5468
 5469        # DEBUG
 5470        log.debug("Start annotation with parquet databases")
 5471
 5472        # Threads
 5473        if not threads:
 5474            threads = self.get_threads()
 5475        log.debug("Threads: " + str(threads))
 5476
 5477        # DEBUG
 5478        delete_tmp = True
 5479        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5480            delete_tmp = False
 5481            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5482
 5483        # Config
 5484        databases_folders = set(
 5485            self.get_config()
 5486            .get("folders", {})
 5487            .get("databases", {})
 5488            .get("annotations", ["."])
 5489            + self.get_config()
 5490            .get("folders", {})
 5491            .get("databases", {})
 5492            .get("parquet", ["."])
 5493        )
 5494        log.debug("Databases annotations: " + str(databases_folders))
 5495
 5496        # Param
 5497        annotations = (
 5498            self.get_param()
 5499            .get("annotation", {})
 5500            .get("parquet", {})
 5501            .get("annotations", None)
 5502        )
 5503        log.debug("Annotations: " + str(annotations))
 5504
 5505        # Assembly
 5506        assembly = self.get_param().get(
 5507            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5508        )
 5509
 5510        # Force Update Annotation
 5511        force_update_annotation = (
 5512            self.get_param()
 5513            .get("annotation", {})
 5514            .get("options", {})
 5515            .get("annotations_update", False)
 5516        )
 5517        log.debug(f"force_update_annotation={force_update_annotation}")
 5518        force_append_annotation = (
 5519            self.get_param()
 5520            .get("annotation", {})
 5521            .get("options", {})
 5522            .get("annotations_append", False)
 5523        )
 5524        log.debug(f"force_append_annotation={force_append_annotation}")
 5525
 5526        # Data
 5527        table_variants = self.get_table_variants()
 5528
 5529        # Check if not empty
 5530        log.debug("Check if not empty")
 5531        sql_query_chromosomes_df = self.get_query_to_df(
 5532            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5533        )
 5534        if not sql_query_chromosomes_df["count"][0]:
 5535            log.info(f"VCF empty")
 5536            return
 5537
 5538        # VCF header
 5539        vcf_reader = self.get_header()
 5540        log.debug("Initial header: " + str(vcf_reader.infos))
 5541
 5542        # Nb Variants POS
 5543        log.debug("NB Variants Start")
 5544        nb_variants = self.conn.execute(
 5545            f"SELECT count(*) AS count FROM variants"
 5546        ).fetchdf()["count"][0]
 5547        log.debug("NB Variants Stop")
 5548
 5549        # Existing annotations
 5550        for vcf_annotation in self.get_header().infos:
 5551
 5552            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5553            log.debug(
 5554                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5555            )
 5556
 5557        # Added columns
 5558        added_columns = []
 5559
 5560        # drop indexes
 5561        log.debug(f"Drop indexes...")
 5562        self.drop_indexes()
 5563
 5564        if annotations:
 5565
 5566            if "ALL" in annotations:
 5567
 5568                all_param = annotations.get("ALL", {})
 5569                all_param_formats = all_param.get("formats", None)
 5570                all_param_releases = all_param.get("releases", None)
 5571
 5572                databases_infos_dict = self.scan_databases(
 5573                    database_formats=all_param_formats,
 5574                    database_releases=all_param_releases,
 5575                )
 5576                for database_infos in databases_infos_dict.keys():
 5577                    if database_infos not in annotations:
 5578                        annotations[database_infos] = {"INFO": None}
 5579
 5580            for annotation in annotations:
 5581
 5582                if annotation in ["ALL"]:
 5583                    continue
 5584
 5585                # Annotation Name
 5586                annotation_name = os.path.basename(annotation)
 5587
 5588                # Annotation fields
 5589                annotation_fields = annotations[annotation]
 5590                if not annotation_fields:
 5591                    annotation_fields = {"INFO": None}
 5592
 5593                log.debug(f"Annotation '{annotation_name}'")
 5594                log.debug(
 5595                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5596                )
 5597
 5598                # Create Database
 5599                database = Database(
 5600                    database=annotation,
 5601                    databases_folders=databases_folders,
 5602                    assembly=assembly,
 5603                )
 5604
 5605                # Find files
 5606                parquet_file = database.get_database()
 5607                parquet_hdr_file = database.get_header_file()
 5608                parquet_type = database.get_type()
 5609
 5610                # Check if files exists
 5611                if not parquet_file or not parquet_hdr_file:
 5612                    log.error("Annotation failed: file not found")
 5613                    raise ValueError("Annotation failed: file not found")
 5614                else:
 5615                    # Get parquet connexion
 5616                    parquet_sql_attach = database.get_sql_database_attach(
 5617                        output="query"
 5618                    )
 5619                    if parquet_sql_attach:
 5620                        self.conn.execute(parquet_sql_attach)
 5621                    parquet_file_link = database.get_sql_database_link()
 5622                    # Log
 5623                    log.debug(
 5624                        f"Annotation '{annotation_name}' - file: "
 5625                        + str(parquet_file)
 5626                        + " and "
 5627                        + str(parquet_hdr_file)
 5628                    )
 5629
 5630                    # Database full header columns
 5631                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5632                        parquet_hdr_file
 5633                    )
 5634                    # Log
 5635                    log.debug(
 5636                        "Annotation database header columns : "
 5637                        + str(parquet_hdr_vcf_header_columns)
 5638                    )
 5639
 5640                    # Load header as VCF object
 5641                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5642                    # Log
 5643                    log.debug(
 5644                        "Annotation database header: "
 5645                        + str(parquet_hdr_vcf_header_infos)
 5646                    )
 5647
 5648                    # Get extra infos
 5649                    parquet_columns = database.get_extra_columns()
 5650                    # Log
 5651                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5652
 5653                    # Add extra columns if "ALL" in annotation_fields
 5654                    # if "ALL" in annotation_fields:
 5655                    #     allow_add_extra_column = True
 5656                    if "ALL" in annotation_fields and database.get_extra_columns():
 5657                        for extra_column in database.get_extra_columns():
 5658                            if (
 5659                                extra_column not in annotation_fields
 5660                                and extra_column.replace("INFO/", "")
 5661                                not in parquet_hdr_vcf_header_infos
 5662                            ):
 5663                                parquet_hdr_vcf_header_infos[extra_column] = (
 5664                                    vcf.parser._Info(
 5665                                        extra_column,
 5666                                        ".",
 5667                                        "String",
 5668                                        f"{extra_column} description",
 5669                                        "unknown",
 5670                                        "unknown",
 5671                                        self.code_type_map["String"],
 5672                                    )
 5673                                )
 5674
 5675                    # For all fields in database
 5676                    annotation_fields_all = False
 5677                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 5678                        annotation_fields_all = True
 5679                        annotation_fields = {
 5680                            key: key for key in parquet_hdr_vcf_header_infos
 5681                        }
 5682
 5683                        log.debug(
 5684                            "Annotation database header - All annotations added: "
 5685                            + str(annotation_fields)
 5686                        )
 5687
 5688                    # Init
 5689
 5690                    # List of annotation fields to use
 5691                    sql_query_annotation_update_info_sets = []
 5692
 5693                    # List of annotation to agregate
 5694                    sql_query_annotation_to_agregate = []
 5695
 5696                    # Number of fields
 5697                    nb_annotation_field = 0
 5698
 5699                    # Annotation fields processed
 5700                    annotation_fields_processed = []
 5701
 5702                    # Columns mapping
 5703                    map_columns = database.map_columns(
 5704                        columns=annotation_fields, prefixes=["INFO/"]
 5705                    )
 5706
 5707                    # Query dict for fields to remove (update option)
 5708                    query_dict_remove = {}
 5709
 5710                    # Fetch Anotation fields
 5711                    for annotation_field in annotation_fields:
 5712
 5713                        # annotation_field_column
 5714                        annotation_field_column = map_columns.get(
 5715                            annotation_field, "INFO"
 5716                        )
 5717
 5718                        # field new name, if parametered
 5719                        annotation_fields_new_name = annotation_fields.get(
 5720                            annotation_field, annotation_field
 5721                        )
 5722                        if not annotation_fields_new_name:
 5723                            annotation_fields_new_name = annotation_field
 5724
 5725                        # To annotate
 5726                        # force_update_annotation = True
 5727                        # force_append_annotation = True
 5728                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 5729                        if annotation_field in parquet_hdr_vcf_header_infos and (
 5730                            force_update_annotation
 5731                            or force_append_annotation
 5732                            or (
 5733                                annotation_fields_new_name
 5734                                not in self.get_header().infos
 5735                            )
 5736                        ):
 5737
 5738                            # Add field to annotation to process list
 5739                            annotation_fields_processed.append(
 5740                                annotation_fields_new_name
 5741                            )
 5742
 5743                            # explode infos for the field
 5744                            annotation_fields_new_name_info_msg = ""
 5745                            if (
 5746                                force_update_annotation
 5747                                and annotation_fields_new_name
 5748                                in self.get_header().infos
 5749                            ):
 5750                                # Remove field from INFO
 5751                                query = f"""
 5752                                    UPDATE {table_variants} as table_variants
 5753                                    SET INFO = REGEXP_REPLACE(
 5754                                                concat(table_variants.INFO,''),
 5755                                                ';*{annotation_fields_new_name}=[^;]*',
 5756                                                ''
 5757                                                )
 5758                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 5759                                """
 5760                                annotation_fields_new_name_info_msg = " [update]"
 5761                                query_dict_remove[
 5762                                    f"remove 'INFO/{annotation_fields_new_name}'"
 5763                                ] = query
 5764
 5765                            # Sep between fields in INFO
 5766                            nb_annotation_field += 1
 5767                            if nb_annotation_field > 1:
 5768                                annotation_field_sep = ";"
 5769                            else:
 5770                                annotation_field_sep = ""
 5771
 5772                            log.info(
 5773                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 5774                            )
 5775
 5776                            # Add INFO field to header
 5777                            parquet_hdr_vcf_header_infos_number = (
 5778                                parquet_hdr_vcf_header_infos[annotation_field].num
 5779                                or "."
 5780                            )
 5781                            parquet_hdr_vcf_header_infos_type = (
 5782                                parquet_hdr_vcf_header_infos[annotation_field].type
 5783                                or "String"
 5784                            )
 5785                            parquet_hdr_vcf_header_infos_description = (
 5786                                parquet_hdr_vcf_header_infos[annotation_field].desc
 5787                                or f"{annotation_field} description"
 5788                            )
 5789                            parquet_hdr_vcf_header_infos_source = (
 5790                                parquet_hdr_vcf_header_infos[annotation_field].source
 5791                                or "unknown"
 5792                            )
 5793                            parquet_hdr_vcf_header_infos_version = (
 5794                                parquet_hdr_vcf_header_infos[annotation_field].version
 5795                                or "unknown"
 5796                            )
 5797
 5798                            vcf_reader.infos[annotation_fields_new_name] = (
 5799                                vcf.parser._Info(
 5800                                    annotation_fields_new_name,
 5801                                    parquet_hdr_vcf_header_infos_number,
 5802                                    parquet_hdr_vcf_header_infos_type,
 5803                                    parquet_hdr_vcf_header_infos_description,
 5804                                    parquet_hdr_vcf_header_infos_source,
 5805                                    parquet_hdr_vcf_header_infos_version,
 5806                                    self.code_type_map[
 5807                                        parquet_hdr_vcf_header_infos_type
 5808                                    ],
 5809                                )
 5810                            )
 5811
 5812                            # Append
 5813                            if force_append_annotation:
 5814                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 5815                            else:
 5816                                query_case_when_append = ""
 5817
 5818                            # Annotation/Update query fields
 5819                            # Found in INFO column
 5820                            if (
 5821                                annotation_field_column == "INFO"
 5822                                and "INFO" in parquet_hdr_vcf_header_columns
 5823                            ):
 5824                                sql_query_annotation_update_info_sets.append(
 5825                                    f"""
 5826                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 5827                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 5828                                        ELSE ''
 5829                                    END
 5830                                """
 5831                                )
 5832                            # Found in a specific column
 5833                            else:
 5834                                sql_query_annotation_update_info_sets.append(
 5835                                    f"""
 5836                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 5837                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 5838                                        ELSE ''
 5839                                    END
 5840                                """
 5841                                )
 5842                                sql_query_annotation_to_agregate.append(
 5843                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 5844                                )
 5845
 5846                        # Not to annotate
 5847                        else:
 5848
 5849                            if force_update_annotation:
 5850                                annotation_message = "forced"
 5851                            else:
 5852                                annotation_message = "skipped"
 5853
 5854                            if annotation_field not in parquet_hdr_vcf_header_infos:
 5855                                log.warning(
 5856                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 5857                                )
 5858                            if annotation_fields_new_name in self.get_header().infos:
 5859                                log.warning(
 5860                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 5861                                )
 5862
 5863                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 5864                    # allow_annotation_full_info = True
 5865                    allow_annotation_full_info = not force_append_annotation
 5866
 5867                    if parquet_type in ["regions"]:
 5868                        allow_annotation_full_info = False
 5869
 5870                    if (
 5871                        allow_annotation_full_info
 5872                        and nb_annotation_field == len(annotation_fields)
 5873                        and annotation_fields_all
 5874                        and (
 5875                            "INFO" in parquet_hdr_vcf_header_columns
 5876                            and "INFO" in database.get_extra_columns()
 5877                        )
 5878                    ):
 5879                        log.debug("Column INFO annotation enabled")
 5880                        sql_query_annotation_update_info_sets = []
 5881                        sql_query_annotation_update_info_sets.append(
 5882                            f" table_parquet.INFO "
 5883                        )
 5884
 5885                    if sql_query_annotation_update_info_sets:
 5886
 5887                        # Annotate
 5888                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 5889
 5890                        # Join query annotation update info sets for SQL
 5891                        sql_query_annotation_update_info_sets_sql = ",".join(
 5892                            sql_query_annotation_update_info_sets
 5893                        )
 5894
 5895                        # Check chromosomes list (and variants infos)
 5896                        sql_query_chromosomes = f"""
 5897                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 5898                            FROM {table_variants} as table_variants
 5899                            GROUP BY table_variants."#CHROM"
 5900                            ORDER BY table_variants."#CHROM"
 5901                            """
 5902                        sql_query_chromosomes_df = self.conn.execute(
 5903                            sql_query_chromosomes
 5904                        ).df()
 5905                        sql_query_chromosomes_dict = {
 5906                            entry["CHROM"]: {
 5907                                "count": entry["count_variants"],
 5908                                "min": entry["min_variants"],
 5909                                "max": entry["max_variants"],
 5910                            }
 5911                            for index, entry in sql_query_chromosomes_df.iterrows()
 5912                        }
 5913
 5914                        # Init
 5915                        nb_of_query = 0
 5916                        nb_of_variant_annotated = 0
 5917                        query_dict = query_dict_remove
 5918
 5919                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 5920                        for chrom in sql_query_chromosomes_dict:
 5921
 5922                            # Number of variant by chromosome
 5923                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 5924                                chrom, {}
 5925                            ).get("count", 0)
 5926
 5927                            log.debug(
 5928                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 5929                            )
 5930
 5931                            # Annotation with regions database
 5932                            if parquet_type in ["regions"]:
 5933                                sql_query_annotation_from_clause = f"""
 5934                                    FROM (
 5935                                        SELECT 
 5936                                            '{chrom}' AS \"#CHROM\",
 5937                                            table_variants_from.\"POS\" AS \"POS\",
 5938                                            {",".join(sql_query_annotation_to_agregate)}
 5939                                        FROM {table_variants} as table_variants_from
 5940                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 5941                                            table_parquet_from."#CHROM" = '{chrom}'
 5942                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 5943                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 5944                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 5945                                                )
 5946                                        )
 5947                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 5948                                        GROUP BY table_variants_from.\"POS\"
 5949                                        )
 5950                                        as table_parquet
 5951                                """
 5952
 5953                                sql_query_annotation_where_clause = """
 5954                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 5955                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5956                                """
 5957
 5958                            # Annotation with variants database
 5959                            else:
 5960                                sql_query_annotation_from_clause = f"""
 5961                                    FROM {parquet_file_link} as table_parquet
 5962                                """
 5963                                sql_query_annotation_where_clause = f"""
 5964                                    table_variants."#CHROM" = '{chrom}'
 5965                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 5966                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5967                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5968                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5969                                """
 5970
 5971                            # Create update query
 5972                            sql_query_annotation_chrom_interval_pos = f"""
 5973                                UPDATE {table_variants} as table_variants
 5974                                    SET INFO = 
 5975                                        concat(
 5976                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5977                                                THEN table_variants.INFO
 5978                                                ELSE ''
 5979                                            END
 5980                                            ,
 5981                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5982                                                        AND (
 5983                                                        concat({sql_query_annotation_update_info_sets_sql})
 5984                                                        )
 5985                                                        NOT IN ('','.') 
 5986                                                    THEN ';'
 5987                                                    ELSE ''
 5988                                            END
 5989                                            ,
 5990                                            {sql_query_annotation_update_info_sets_sql}
 5991                                            )
 5992                                    {sql_query_annotation_from_clause}
 5993                                    WHERE {sql_query_annotation_where_clause}
 5994                                    ;
 5995                                """
 5996
 5997                            # Add update query to dict
 5998                            query_dict[
 5999                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6000                            ] = sql_query_annotation_chrom_interval_pos
 6001
 6002                        nb_of_query = len(query_dict)
 6003                        num_query = 0
 6004
 6005                        # SET max_expression_depth TO x
 6006                        self.conn.execute("SET max_expression_depth TO 10000")
 6007
 6008                        for query_name in query_dict:
 6009                            query = query_dict[query_name]
 6010                            num_query += 1
 6011                            log.info(
 6012                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6013                            )
 6014                            result = self.conn.execute(query)
 6015                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6016                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6017                            log.info(
 6018                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6019                            )
 6020
 6021                        log.info(
 6022                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6023                        )
 6024
 6025                    else:
 6026
 6027                        log.info(
 6028                            f"Annotation '{annotation_name}' - No Annotations available"
 6029                        )
 6030
 6031                    log.debug("Final header: " + str(vcf_reader.infos))
 6032
 6033        # Remove added columns
 6034        for added_column in added_columns:
 6035            self.drop_column(column=added_column)
 6036
 6037    def annotation_splice(self, threads: int = None) -> None:
 6038        """
 6039        This function annotate with snpEff
 6040
 6041        :param threads: The number of threads to use
 6042        :return: the value of the variable "return_value".
 6043        """
 6044
 6045        # DEBUG
 6046        log.debug("Start annotation with splice tools")
 6047
 6048        # Threads
 6049        if not threads:
 6050            threads = self.get_threads()
 6051        log.debug("Threads: " + str(threads))
 6052
 6053        # DEBUG
 6054        delete_tmp = True
 6055        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6056            delete_tmp = False
 6057            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6058
 6059        # Config
 6060        config = self.get_config()
 6061        log.debug("Config: " + str(config))
 6062        splice_config = config.get("tools", {}).get("splice", {})
 6063        if not splice_config:
 6064            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6065        if not splice_config:
 6066            msg_err = "No Splice tool config"
 6067            log.error(msg_err)
 6068            raise ValueError(msg_err)
 6069        log.debug(f"splice_config={splice_config}")
 6070
 6071        # Config - Folders - Databases
 6072        databases_folders = (
 6073            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6074        )
 6075        log.debug("Databases annotations: " + str(databases_folders))
 6076
 6077        # Splice docker image
 6078        splice_docker_image = splice_config.get("docker").get("image")
 6079
 6080        # Pull splice image if it's not already there
 6081        if not check_docker_image_exists(splice_docker_image):
 6082            log.warning(
 6083                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6084            )
 6085            try:
 6086                command(f"docker pull {splice_config.get('docker').get('image')}")
 6087            except subprocess.CalledProcessError:
 6088                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6089                log.error(msg_err)
 6090                raise ValueError(msg_err)
 6091                return None
 6092
 6093        # Config - splice databases
 6094        splice_databases = (
 6095            config.get("folders", {})
 6096            .get("databases", {})
 6097            .get("splice", DEFAULT_SPLICE_FOLDER)
 6098        )
 6099        splice_databases = full_path(splice_databases)
 6100
 6101        # Param
 6102        param = self.get_param()
 6103        log.debug("Param: " + str(param))
 6104
 6105        # Param
 6106        options = param.get("annotation", {}).get("splice", {})
 6107        log.debug("Options: " + str(options))
 6108
 6109        # Data
 6110        table_variants = self.get_table_variants()
 6111
 6112        # Check if not empty
 6113        log.debug("Check if not empty")
 6114        sql_query_chromosomes = (
 6115            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6116        )
 6117        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6118            log.info("VCF empty")
 6119            return None
 6120
 6121        # Export in VCF
 6122        log.debug("Create initial file to annotate")
 6123
 6124        # Create output folder
 6125        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6126        if not os.path.exists(output_folder):
 6127            Path(output_folder).mkdir(parents=True, exist_ok=True)
 6128
 6129        # Create tmp VCF file
 6130        tmp_vcf = NamedTemporaryFile(
 6131            prefix=self.get_prefix(),
 6132            dir=output_folder,
 6133            suffix=".vcf",
 6134            delete=False,
 6135        )
 6136        tmp_vcf_name = tmp_vcf.name
 6137
 6138        # VCF header
 6139        header = self.get_header()
 6140
 6141        # Existing annotations
 6142        for vcf_annotation in self.get_header().infos:
 6143
 6144            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6145            log.debug(
 6146                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6147            )
 6148
 6149        # Memory limit
 6150        if config.get("memory", None):
 6151            memory_limit = config.get("memory", "8G").upper()
 6152            # upper()
 6153        else:
 6154            memory_limit = "8G"
 6155        log.debug(f"memory_limit: {memory_limit}")
 6156
 6157        # Check number of variants to annotate
 6158        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6159        where_clause_regex_spip = r"SPiP_\w+"
 6160        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6161        df_list_of_variants_to_annotate = self.get_query_to_df(
 6162            query=f""" SELECT * FROM variants {where_clause} """
 6163        )
 6164        if len(df_list_of_variants_to_annotate) == 0:
 6165            log.warning(
 6166                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6167            )
 6168            return None
 6169        else:
 6170            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6171
 6172        # Export VCF file
 6173        self.export_variant_vcf(
 6174            vcf_file=tmp_vcf_name,
 6175            remove_info=True,
 6176            add_samples=True,
 6177            index=False,
 6178            where_clause=where_clause,
 6179        )
 6180
 6181        # Create docker container and launch splice analysis
 6182        if splice_config:
 6183
 6184            # Splice mount folders
 6185            mount_folders = splice_config.get("mount", {})
 6186
 6187            # Genome mount
 6188            mount_folders[
 6189                config.get("folders", {})
 6190                .get("databases", {})
 6191                .get("genomes", DEFAULT_GENOME_FOLDER)
 6192            ] = "ro"
 6193
 6194            # SpliceAI mount
 6195            mount_folders[
 6196                config.get("folders", {})
 6197                .get("databases", {})
 6198                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
 6199            ] = "ro"
 6200
 6201            # Genome mount
 6202            mount_folders[
 6203                config.get("folders", {})
 6204                .get("databases", {})
 6205                .get("spip", DEFAULT_SPIP_FOLDER)
 6206            ] = "ro"
 6207
 6208            # Mount folders
 6209            mount = []
 6210
 6211            # Config mount
 6212            mount = [
 6213                f"-v {full_path(path)}:{full_path(path)}:{mode}"
 6214                for path, mode in mount_folders.items()
 6215            ]
 6216
 6217            if any(value for value in splice_config.values() if value is None):
 6218                log.warning("At least one splice config parameter is empty")
 6219                return None
 6220
 6221            # Params in splice nf
 6222            def check_values(dico: dict):
 6223                """
 6224                Ensure parameters for NF splice pipeline
 6225                """
 6226                for key, val in dico.items():
 6227                    if key == "genome":
 6228                        if any(
 6229                            assemb in options.get("genome", {})
 6230                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6231                        ):
 6232                            yield f"--{key} hg19"
 6233                        elif any(
 6234                            assemb in options.get("genome", {})
 6235                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6236                        ):
 6237                            yield f"--{key} hg38"
 6238                    elif (
 6239                        (isinstance(val, str) and val)
 6240                        or isinstance(val, int)
 6241                        or isinstance(val, bool)
 6242                    ):
 6243                        yield f"--{key} {val}"
 6244
 6245            # Genome
 6246            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6247            options["genome"] = genome
 6248
 6249            # NF params
 6250            nf_params = []
 6251
 6252            # Add options
 6253            if options:
 6254                nf_params = list(check_values(options))
 6255                log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6256            else:
 6257                log.debug("No NF params provided")
 6258
 6259            # Add threads
 6260            if "threads" not in options.keys():
 6261                nf_params.append(f"--threads {threads}")
 6262
 6263            # Genome path
 6264            genome_path = find_genome(
 6265                config.get("folders", {})
 6266                .get("databases", {})
 6267                .get("genomes", DEFAULT_GENOME_FOLDER),
 6268                file=f"{genome}.fa",
 6269            )
 6270            # Add genome path
 6271            if not genome_path:
 6272                raise ValueError(
 6273                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6274                )
 6275            else:
 6276                log.debug(f"Genome: {genome_path}")
 6277                nf_params.append(f"--genome_path {genome_path}")
 6278
 6279            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6280                """
 6281                Setting up updated databases for SPiP and SpliceAI
 6282                """
 6283
 6284                try:
 6285
 6286                    # SpliceAI assembly transcriptome
 6287                    spliceai_assembly = os.path.join(
 6288                        config.get("folders", {})
 6289                        .get("databases", {})
 6290                        .get("spliceai", {}),
 6291                        options.get("genome"),
 6292                        "transcriptome",
 6293                    )
 6294                    spip_assembly = options.get("genome")
 6295
 6296                    spip = find(
 6297                        f"transcriptome_{spip_assembly}.RData",
 6298                        config.get("folders", {}).get("databases", {}).get("spip", {}),
 6299                    )
 6300                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6301                    log.debug(f"SPiP annotations: {spip}")
 6302                    log.debug(f"SpliceAI annotations: {spliceai}")
 6303                    if spip and spliceai:
 6304                        return [
 6305                            f"--spip_transcriptome {spip}",
 6306                            f"--spliceai_annotations {spliceai}",
 6307                        ]
 6308                    else:
 6309                        # TODO crash and go on with basic annotations ?
 6310                        # raise ValueError(
 6311                        #     "Can't find splice databases in configuration EXIT"
 6312                        # )
 6313                        log.warning(
 6314                            "Can't find splice databases in configuration, use annotations file from image"
 6315                        )
 6316                except TypeError:
 6317                    log.warning(
 6318                        "Can't find splice databases in configuration, use annotations file from image"
 6319                    )
 6320                    return []
 6321
 6322            # Add options, check if transcriptome option have already beend provided
 6323            if (
 6324                "spip_transcriptome" not in nf_params
 6325                and "spliceai_transcriptome" not in nf_params
 6326            ):
 6327                splice_reference = splice_annotations(options, config)
 6328                if splice_reference:
 6329                    nf_params.extend(splice_reference)
 6330
 6331            nf_params.append(f"--output_folder {output_folder}")
 6332
 6333            random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6334            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6335            log.debug(cmd)
 6336
 6337            splice_config["docker"]["command"] = cmd
 6338
 6339            docker_cmd = get_bin_command(
 6340                tool="splice",
 6341                bin_type="docker",
 6342                config=config,
 6343                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6344                add_options=f"--name {random_uuid} {' '.join(mount)}",
 6345            )
 6346
 6347            # Docker debug
 6348            # if splice_config.get("rm_container"):
 6349            #     rm_container = "--rm"
 6350            # else:
 6351            #     rm_container = ""
 6352            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6353
 6354            log.debug(docker_cmd)
 6355            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6356            log.debug(res.stdout)
 6357            if res.stderr:
 6358                log.error(res.stderr)
 6359            res.check_returncode()
 6360        else:
 6361            log.warning(f"Splice tool configuration not found: {config}")
 6362
 6363        # Update variants
 6364        log.info("Annotation - Updating...")
 6365        # Test find output vcf
 6366        log.debug(
 6367            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6368        )
 6369        output_vcf = []
 6370        # Wrong folder to look in
 6371        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6372            if (
 6373                files
 6374                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6375            ):
 6376                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6377        # log.debug(os.listdir(options.get("output_folder")))
 6378        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6379        if not output_vcf:
 6380            log.debug(
 6381                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6382            )
 6383        else:
 6384            # Get new header from annotated vcf
 6385            log.debug(f"Initial header: {len(header.infos)} fields")
 6386            # Create new header with splice infos
 6387            new_vcf = Variants(input=output_vcf[0])
 6388            new_vcf_header = new_vcf.get_header().infos
 6389            for keys, infos in new_vcf_header.items():
 6390                if keys not in header.infos.keys():
 6391                    header.infos[keys] = infos
 6392            log.debug(f"New header: {len(header.infos)} fields")
 6393            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6394            self.update_from_vcf(output_vcf[0])
 6395
 6396        # Remove folder
 6397        remove_if_exists(output_folder)
 6398
 6399    ###
 6400    # Prioritization
 6401    ###
 6402
 6403    def get_config_default(self, name: str) -> dict:
 6404        """
 6405        The function `get_config_default` returns a dictionary containing default configurations for
 6406        various calculations and prioritizations.
 6407
 6408        :param name: The `get_config_default` function returns a dictionary containing default
 6409        configurations for different calculations and prioritizations. The `name` parameter is used to
 6410        specify which specific configuration to retrieve from the dictionary
 6411        :type name: str
 6412        :return: The function `get_config_default` returns a dictionary containing default configuration
 6413        settings for different calculations and prioritizations. The specific configuration settings are
 6414        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6415        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6416        returned. If there is no match, an empty dictionary is returned.
 6417        """
 6418
 6419        config_default = {
 6420            "calculations": {
 6421                "variant_chr_pos_alt_ref": {
 6422                    "type": "sql",
 6423                    "name": "variant_chr_pos_alt_ref",
 6424                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6425                    "available": False,
 6426                    "output_column_name": "variant_chr_pos_alt_ref",
 6427                    "output_column_type": "String",
 6428                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6429                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6430                    "operation_info": True,
 6431                },
 6432                "VARTYPE": {
 6433                    "type": "sql",
 6434                    "name": "VARTYPE",
 6435                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6436                    "available": True,
 6437                    "output_column_name": "VARTYPE",
 6438                    "output_column_type": "String",
 6439                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6440                    "operation_query": """
 6441                            CASE
 6442                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6443                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6444                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6445                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6446                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6447                                ELSE 'UNDEFINED'
 6448                            END
 6449                            """,
 6450                    "info_fields": ["SVTYPE"],
 6451                    "operation_info": True,
 6452                },
 6453                "snpeff_hgvs": {
 6454                    "type": "python",
 6455                    "name": "snpeff_hgvs",
 6456                    "description": "HGVS nomenclatures from snpEff annotation",
 6457                    "available": True,
 6458                    "function_name": "calculation_extract_snpeff_hgvs",
 6459                    "function_params": ["snpeff_hgvs", "ANN"],
 6460                },
 6461                "snpeff_ann_explode": {
 6462                    "type": "python",
 6463                    "name": "snpeff_ann_explode",
 6464                    "description": "Explode snpEff annotations with uniquify values",
 6465                    "available": True,
 6466                    "function_name": "calculation_snpeff_ann_explode",
 6467                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6468                },
 6469                "snpeff_ann_explode_uniquify": {
 6470                    "type": "python",
 6471                    "name": "snpeff_ann_explode_uniquify",
 6472                    "description": "Explode snpEff annotations",
 6473                    "available": True,
 6474                    "function_name": "calculation_snpeff_ann_explode",
 6475                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6476                },
 6477                "snpeff_ann_explode_json": {
 6478                    "type": "python",
 6479                    "name": "snpeff_ann_explode_json",
 6480                    "description": "Explode snpEff annotations in JSON format",
 6481                    "available": True,
 6482                    "function_name": "calculation_snpeff_ann_explode",
 6483                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6484                },
 6485                "NOMEN": {
 6486                    "type": "python",
 6487                    "name": "NOMEN",
 6488                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
 6489                    "available": True,
 6490                    "function_name": "calculation_extract_nomen",
 6491                    "function_params": [],
 6492                },
 6493                "FINDBYPIPELINE": {
 6494                    "type": "python",
 6495                    "name": "FINDBYPIPELINE",
 6496                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6497                    "available": True,
 6498                    "function_name": "calculation_find_by_pipeline",
 6499                    "function_params": ["findbypipeline"],
 6500                },
 6501                "FINDBYSAMPLE": {
 6502                    "type": "python",
 6503                    "name": "FINDBYSAMPLE",
 6504                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6505                    "available": True,
 6506                    "function_name": "calculation_find_by_pipeline",
 6507                    "function_params": ["findbysample"],
 6508                },
 6509                "GENOTYPECONCORDANCE": {
 6510                    "type": "python",
 6511                    "name": "GENOTYPECONCORDANCE",
 6512                    "description": "Concordance of genotype for multi caller VCF",
 6513                    "available": True,
 6514                    "function_name": "calculation_genotype_concordance",
 6515                    "function_params": [],
 6516                },
 6517                "BARCODE": {
 6518                    "type": "python",
 6519                    "name": "BARCODE",
 6520                    "description": "BARCODE as VaRank tool",
 6521                    "available": True,
 6522                    "function_name": "calculation_barcode",
 6523                    "function_params": [],
 6524                },
 6525                "BARCODEFAMILY": {
 6526                    "type": "python",
 6527                    "name": "BARCODEFAMILY",
 6528                    "description": "BARCODEFAMILY as VaRank tool",
 6529                    "available": True,
 6530                    "function_name": "calculation_barcode_family",
 6531                    "function_params": ["BCF"],
 6532                },
 6533                "TRIO": {
 6534                    "type": "python",
 6535                    "name": "TRIO",
 6536                    "description": "Inheritance for a trio family",
 6537                    "available": True,
 6538                    "function_name": "calculation_trio",
 6539                    "function_params": [],
 6540                },
 6541                "VAF": {
 6542                    "type": "python",
 6543                    "name": "VAF",
 6544                    "description": "Variant Allele Frequency (VAF) harmonization",
 6545                    "available": True,
 6546                    "function_name": "calculation_vaf_normalization",
 6547                    "function_params": [],
 6548                },
 6549                "VAF_stats": {
 6550                    "type": "python",
 6551                    "name": "VAF_stats",
 6552                    "description": "Variant Allele Frequency (VAF) statistics",
 6553                    "available": True,
 6554                    "function_name": "calculation_genotype_stats",
 6555                    "function_params": ["VAF"],
 6556                },
 6557                "DP_stats": {
 6558                    "type": "python",
 6559                    "name": "DP_stats",
 6560                    "description": "Depth (DP) statistics",
 6561                    "available": True,
 6562                    "function_name": "calculation_genotype_stats",
 6563                    "function_params": ["DP"],
 6564                },
 6565                "variant_id": {
 6566                    "type": "python",
 6567                    "name": "variant_id",
 6568                    "description": "Variant ID generated from variant position and type",
 6569                    "available": True,
 6570                    "function_name": "calculation_variant_id",
 6571                    "function_params": [],
 6572                },
 6573                "transcripts_json": {
 6574                    "type": "python",
 6575                    "name": "transcripts_json",
 6576                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6577                    "available": True,
 6578                    "function_name": "calculation_transcripts_annotation",
 6579                    "function_params": ["transcripts_json", None],
 6580                },
 6581                "transcripts_ann": {
 6582                    "type": "python",
 6583                    "name": "transcripts_ann",
 6584                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6585                    "available": True,
 6586                    "function_name": "calculation_transcripts_annotation",
 6587                    "function_params": [None, "transcripts_ann"],
 6588                },
 6589                "transcripts_annotations": {
 6590                    "type": "python",
 6591                    "name": "transcripts_annotations",
 6592                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6593                    "available": True,
 6594                    "function_name": "calculation_transcripts_annotation",
 6595                    "function_params": [None, None],
 6596                },
 6597                "transcripts_prioritization": {
 6598                    "type": "python",
 6599                    "name": "transcripts_prioritization",
 6600                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6601                    "available": True,
 6602                    "function_name": "calculation_transcripts_prioritization",
 6603                    "function_params": [],
 6604                },
 6605            },
 6606            "prioritizations": {
 6607                "default": {
 6608                    "ANN2": [
 6609                        {
 6610                            "type": "contains",
 6611                            "value": "HIGH",
 6612                            "score": 5,
 6613                            "flag": "PASS",
 6614                            "comment": [
 6615                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6616                            ],
 6617                        },
 6618                        {
 6619                            "type": "contains",
 6620                            "value": "MODERATE",
 6621                            "score": 3,
 6622                            "flag": "PASS",
 6623                            "comment": [
 6624                                "A non-disruptive variant that might change protein effectiveness"
 6625                            ],
 6626                        },
 6627                        {
 6628                            "type": "contains",
 6629                            "value": "LOW",
 6630                            "score": 0,
 6631                            "flag": "FILTERED",
 6632                            "comment": [
 6633                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6634                            ],
 6635                        },
 6636                        {
 6637                            "type": "contains",
 6638                            "value": "MODIFIER",
 6639                            "score": 0,
 6640                            "flag": "FILTERED",
 6641                            "comment": [
 6642                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6643                            ],
 6644                        },
 6645                    ],
 6646                }
 6647            },
 6648        }
 6649
 6650        return config_default.get(name, None)
 6651
 6652    def get_config_json(
 6653        self, name: str, config_dict: dict = {}, config_file: str = None
 6654    ) -> dict:
 6655        """
 6656        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6657        default values, a dictionary, and a file.
 6658
 6659        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6660        the name of the configuration. It is used to identify and retrieve the configuration settings
 6661        for a specific component or module
 6662        :type name: str
 6663        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6664        dictionary that allows you to provide additional configuration settings or overrides. When you
 6665        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6666        the key is the configuration setting you want to override or
 6667        :type config_dict: dict
 6668        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6669        specify the path to a configuration file that contains additional settings. If provided, the
 6670        function will read the contents of this file and update the configuration dictionary with the
 6671        values found in the file, overriding any existing values with the
 6672        :type config_file: str
 6673        :return: The function `get_config_json` returns a dictionary containing the configuration
 6674        settings.
 6675        """
 6676
 6677        # Create with default prioritizations
 6678        config_default = self.get_config_default(name=name)
 6679        configuration = config_default
 6680        # log.debug(f"configuration={configuration}")
 6681
 6682        # Replace prioritizations from dict
 6683        for config in config_dict:
 6684            configuration[config] = config_dict[config]
 6685
 6686        # Replace prioritizations from file
 6687        config_file = full_path(config_file)
 6688        if config_file:
 6689            if os.path.exists(config_file):
 6690                with open(config_file) as config_file_content:
 6691                    config_file_dict = json.load(config_file_content)
 6692                for config in config_file_dict:
 6693                    configuration[config] = config_file_dict[config]
 6694            else:
 6695                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6696                log.error(msg_error)
 6697                raise ValueError(msg_error)
 6698
 6699        return configuration
 6700
 6701    def prioritization(
 6702        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6703    ) -> bool:
 6704        """
 6705        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 6706        prioritizes variants based on configured profiles and criteria.
 6707
 6708        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 6709        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 6710        a table name is provided, the method will prioritize the variants in that specific table
 6711        :type table: str
 6712        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 6713        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 6714        provided, the code will use a default prefix value of "PZ"
 6715        :type pz_prefix: str
 6716        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 6717        additional parameters specific to the prioritization process. These parameters can include
 6718        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 6719        configurations needed for the prioritization of variants in a V
 6720        :type pz_param: dict
 6721        :return: A boolean value (True) is being returned from the `prioritization` function.
 6722        """
 6723
 6724        # Config
 6725        config = self.get_config()
 6726
 6727        # Param
 6728        param = self.get_param()
 6729
 6730        # Prioritization param
 6731        if pz_param is not None:
 6732            prioritization_param = pz_param
 6733        else:
 6734            prioritization_param = param.get("prioritization", {})
 6735
 6736        # Configuration profiles
 6737        prioritization_config_file = prioritization_param.get(
 6738            "prioritization_config", None
 6739        )
 6740        prioritization_config_file = full_path(prioritization_config_file)
 6741        prioritizations_config = self.get_config_json(
 6742            name="prioritizations", config_file=prioritization_config_file
 6743        )
 6744
 6745        # Prioritization prefix
 6746        pz_prefix_default = "PZ"
 6747        if pz_prefix is None:
 6748            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 6749
 6750        # Prioritization options
 6751        profiles = prioritization_param.get("profiles", [])
        # NOTE(review): this span is the interior of the prioritization routine;
        # `profiles`, `prioritization_param`, `prioritizations_config`, `param`,
        # `pz_prefix` and `table` are bound earlier in the method (outside this view).

        # Normalize profiles / pzfields: accept comma-separated strings as well as lists.
        if isinstance(profiles, str):
            profiles = profiles.split(",")
        pzfields = prioritization_param.get(
            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
        )
        if isinstance(pzfields, str):
            pzfields = pzfields.split(",")
        default_profile = prioritization_param.get("default_profile", None)
        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
        prioritization_score_mode = prioritization_param.get(
            "prioritization_score_mode", "HOWARD"
        )

        # Quick Prioritizations: extra profiles requested directly via param["prioritizations"]
        prioritizations = param.get("prioritizations", None)
        if prioritizations:
            log.info("Quick Prioritization:")
            for profile in prioritizations.split(","):
                if profile not in profiles:
                    profiles.append(profile)
                    log.info(f"   {profile}")

        # If profile "ALL" provided, use all profiles from the configuration
        if "ALL" in profiles:
            profiles = list(prioritizations_config.keys())

        # Every requested profile must exist in the configuration, otherwise fail fast
        for profile in profiles:
            if prioritizations_config.get(profile, None):
                log.debug(f"Profile '{profile}' configured")
            else:
                msg_error = f"Profile '{profile}' NOT configured"
                log.error(msg_error)
                raise ValueError(msg_error)

        # Nothing to do without at least one profile
        if profiles:
            log.info(f"Prioritization... ")
        else:
            log.debug(f"No profile defined")
            return False

        # Default profile falls back to the first requested profile
        if not default_profile and len(profiles):
            default_profile = profiles[0]

        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
        log.debug("Profiles to check: " + str(list(profiles)))

        # Target table: explicit `table` argument wins over the default variants table
        if table is not None:
            table_variants = table
        else:
            table_variants = self.get_table_variants(clause="update")
        log.debug(f"Table to prioritize: {table_variants}")

        # Columns added along the way (exploded INFO fields, PZ columns)
        added_columns = []

        # Build the candidate PZfield names: bare fields plus one
        # "<field><sep><profile>" variant per (field, profile) pair
        list_of_pzfields_original = pzfields + [
            pzfield + pzfields_sep + profile
            for pzfield in pzfields
            for profile in profiles
        ]
        list_of_pzfields = []
        log.debug(f"{list_of_pzfields_original}")

        # Keep only PZfields not already declared in the VCF header
        for pzfield in list_of_pzfields_original:
            if self.get_header().infos.get(pzfield, None) is None:
                list_of_pzfields.append(pzfield)
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
            else:
                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")

        if list_of_pzfields:

            # Explode Infos prefix
            explode_infos_prefix = self.get_explode_infos_prefix()

            # Descriptions of the supported PZ INFO tags (header metadata)
            PZfields_INFOS = {
                f"{pz_prefix}Tags": {
                    "ID": f"{pz_prefix}Tags",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant tags based on annotation criteria",
                },
                f"{pz_prefix}Score": {
                    "ID": f"{pz_prefix}Score",
                    "Number": 1,
                    "Type": "Integer",
                    "Description": "Variant score based on annotation criteria",
                },
                f"{pz_prefix}Flag": {
                    "ID": f"{pz_prefix}Flag",
                    "Number": 1,
                    "Type": "String",
                    "Description": "Variant flag based on annotation criteria",
                },
                f"{pz_prefix}Comment": {
                    "ID": f"{pz_prefix}Comment",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant comment based on annotation criteria",
                },
                f"{pz_prefix}Infos": {
                    "ID": f"{pz_prefix}Infos",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant infos based on annotation criteria",
                },
                f"{pz_prefix}Class": {
                    "ID": f"{pz_prefix}Class",
                    "Number": ".",
                    "Type": "String",
                    "Description": "Variant class based on annotation criteria",
                },
            }

            # Declare bare PZ INFO fields in the header (tagged with the default profile)
            for field in PZfields_INFOS:
                field_ID = PZfields_INFOS[field]["ID"]
                field_description = PZfields_INFOS[field]["Description"]
                if field_ID not in self.get_header().infos and field_ID in pzfields:
                    field_description = (
                        PZfields_INFOS[field]["Description"]
                        + f", profile {default_profile}"
                    )
                    self.get_header().infos[field_ID] = vcf.parser._Info(
                        field_ID,
                        PZfields_INFOS[field]["Number"],
                        PZfields_INFOS[field]["Type"],
                        field_description,
                        "unknown",
                        "unknown",
                        code_type_map[PZfields_INFOS[field]["Type"]],
                    )

            # Declare per-profile PZ INFO fields ("<field><sep><profile>") in the header
            for profile in prioritizations_config:
                # NOTE(review): `profiles == []` is unreachable here — an empty
                # `profiles` list already returned False above; confirm intent.
                if profile in profiles or profiles == []:
                    for field in PZfields_INFOS:
                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
                        field_description = (
                            PZfields_INFOS[field]["Description"]
                            + f", profile {profile}"
                        )
                        if (
                            field_ID not in self.get_header().infos
                            and field in pzfields
                        ):
                            self.get_header().infos[field_ID] = vcf.parser._Info(
                                field_ID,
                                PZfields_INFOS[field]["Number"],
                                PZfields_INFOS[field]["Type"],
                                field_description,
                                "unknown",
                                "unknown",
                                code_type_map[PZfields_INFOS[field]["Type"]],
                            )

            # Add one working column per PZfield to the variants table, typed and
            # defaulted by field kind (Score=INTEGER 0, Flag=BOOLEAN 1 i.e. PASS,
            # Class=VARCHAR[] null, anything else=STRING '')
            for pzfield in list_of_pzfields:
                if re.match(f"{pz_prefix}Score.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="INTEGER",
                        default_value="0",
                    )
                elif re.match(f"{pz_prefix}Flag.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="BOOLEAN",
                        default_value="1",
                    )
                elif re.match(f"{pz_prefix}Class.*", pzfield):
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="VARCHAR[]",
                        default_value="null",
                    )
                else:
                    added_column = self.add_column(
                        table_name=table_variants,
                        column_name=pzfield,
                        column_type="STRING",
                        default_value="''",
                    )
                added_columns.append(added_column)

            # Profiles
            if profiles:

                # foreach profile in configuration file
                for profile in prioritizations_config:

                    # If profile is asked in param, or ALL are asked (empty profile [])
                    if profile in profiles or profiles == []:
                        log.info(f"Profile '{profile}'")

                        sql_set_info_option = ""

                        sql_set_info = []

                        # Build the SQL expressions that serialize each PZ column
                        # back into "key=value" INFO entries

                        # PZScore
                        if (
                            f"{pz_prefix}Score{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                    concat(
                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
                                        {pz_prefix}Score{pzfields_sep}{profile}
                                    ) 
                                """
                            )
                            # Default profile also feeds the bare PZScore tag
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Score" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                        concat(
                                            '{pz_prefix}Score=',
                                            {pz_prefix}Score{pzfields_sep}{profile}
                                        )
                                    """
                                )

                        # PZFlag: boolean column rendered as PASS/FILTERED
                        if (
                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                    concat(
                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
                                        CASE 
                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                            THEN 'PASS'
                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                            THEN 'FILTERED'
                                        END
                                    ) 
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Flag" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                        concat(
                                            '{pz_prefix}Flag=',
                                            CASE 
                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
                                                THEN 'PASS'
                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
                                                THEN 'FILTERED'
                                            END
                                        )
                                    """
                                )

                        # PZClass: list column rendered as comma-joined distinct values, '.' if empty
                        if (
                            f"{pz_prefix}Class{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                    concat(
                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
                                        CASE
                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                            ELSE '.'
                                        END 
                                    )
                                    
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Class" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                        concat(
                                            '{pz_prefix}Class=',
                                            CASE
                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                ELSE '.'
                                            END 
                                        )
                                    """
                                )

                        # PZComment: emitted only when non-empty
                        if (
                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                    CASE
                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Comment" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                        CASE
                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
                                            ELSE ''
                                        END
                                    """
                                )

                        # PZInfos: emitted only when non-empty
                        if (
                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):
                            sql_set_info.append(
                                f"""
                                    CASE
                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
                                        ELSE ''
                                    END
                                """
                            )
                            if (
                                profile == default_profile
                                and f"{pz_prefix}Infos" in list_of_pzfields
                            ):
                                sql_set_info.append(
                                    f"""
                                        CASE
                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
                                            ELSE ''
                                        END
                                    """
                                )

                        # Merge PZfields into one ';'-separated concat expression
                        # NOTE(review): sql_set_info_option is built here but not
                        # referenced later in this visible span — confirm it is
                        # consumed further down the method.
                        sql_set_info_option = ""
                        sql_set_sep = ""
                        for sql_set in sql_set_info:
                            if sql_set_sep:
                                sql_set_info_option += f"""
                                    , concat('{sql_set_sep}', {sql_set})
                                """
                            else:
                                sql_set_info_option += f"""
                                    , {sql_set}
                                """
                            sql_set_sep = ";"

                        sql_queries = []
                        for annotation in prioritizations_config[profile]:

                            # skip special sections (keys starting with "_")
                            if annotation.startswith("_"):
                                continue

                            # For each criterions
                            for criterion in prioritizations_config[profile][
                                annotation
                            ]:

                                # Criterion mode: "operation" (type/value keys)
                                # or "sql" (sql/fields keys); stays None when
                                # neither key set is present (rejected later)
                                criterion_mode = None
                                if np.any(
                                    np.isin(list(criterion.keys()), ["type", "value"])
                                ):
                                    criterion_mode = "operation"
                                elif np.any(
                                    np.isin(list(criterion.keys()), ["sql", "fields"])
                                ):
                                    criterion_mode = "sql"
                                log.debug(f"Criterion Mode: {criterion_mode}")

                                # Criterion parameters; comment/infos are escaped
                                # for safe embedding in SQL literals and INFO text
                                criterion_type = criterion.get("type", None)
                                criterion_value = criterion.get("value", None)
                                criterion_sql = criterion.get("sql", None)
                                criterion_fields = criterion.get("fields", None)
                                criterion_score = criterion.get("score", 0)
                                criterion_flag = criterion.get("flag", "PASS")
                                criterion_class = criterion.get("class", None)
                                criterion_flag_bool = criterion_flag == "PASS"
                                criterion_comment = (
                                    ", ".join(criterion.get("comment", []))
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )
                                criterion_infos = (
                                    str(criterion)
                                    .replace("'", "''")
                                    .replace(";", ",")
                                    .replace("\t", " ")
                                )

                                # SQL clause may be given as a list of fragments
                                if criterion_sql is not None and isinstance(
                                    criterion_sql, list
                                ):
                                    criterion_sql = " ".join(criterion_sql)

                                # Fields default to the annotation name; accept CSV string
                                if criterion_fields is None:
                                    criterion_fields = [annotation]
                                if not isinstance(criterion_fields, list):
                                    criterion_fields = str(criterion_fields).split(",")

                                # Class accepted as CSV string or list
                                if criterion_class is not None and not isinstance(
                                    criterion_class, list
                                ):
                                    criterion_class = str(criterion_class).split(",")

                                for annotation_field in criterion_fields:

                                    # Explode the annotation into its own column
                                    # so the UPDATE can filter on it
                                    log.debug(
                                        f"Explode annotation '{annotation_field}'"
                                    )
                                    added_columns += self.explode_infos(
                                        prefix=explode_infos_prefix,
                                        fields=[annotation_field],
                                        table=table_variants,
                                    )
                                    extra_infos = self.get_extra_infos(
                                        table=table_variants
                                    )

                                    # Check if annotation field is present
                                    if (
                                        f"{explode_infos_prefix}{annotation_field}"
                                        not in extra_infos
                                    ):
                                        msq_err = f"Annotation '{annotation_field}' not in data"
                                        log.error(msq_err)
                                        raise ValueError(msq_err)
                                    else:
                                        log.debug(
                                            f"Annotation '{annotation_field}' in data"
                                        )

                                # NOTE(review): sql_set_info is reset here but not
                                # read again within this visible span
                                sql_set = []
                                sql_set_info = []

                                # SET clauses applying this criterion to the PZ columns

                                # PZScore
                                if (
                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    # VaRank mode keeps the maximum criterion score;
                                    # default HOWARD mode accumulates scores
                                    if prioritization_score_mode == "VaRank":
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
                                        )
                                    # default HOWARD prioritization score mode
                                    else:
                                        sql_set.append(
                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
                                        )

                                # PZFlag: AND-combined, so any FILTERED criterion filters the variant
                                if (
                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
                                    )

                                # PZClass: merge criterion classes into the distinct list
                                if (
                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                    and criterion_class is not None
                                ):
                                    sql_set.append(
                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
                                    )

                                # PZComment: append with ', ' separator when non-empty
                                if (
                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
                                                concat(
                                                    {pz_prefix}Comment{pzfields_sep}{profile},
                                                    CASE 
                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
                                                        THEN ', '
                                                        ELSE ''
                                                    END,
                                                    '{criterion_comment}'
                                                )
                                        """
                                    )

                                # PZInfos: append the raw criterion repr
                                if (
                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
                                    in list_of_pzfields
                                ):
                                    sql_set.append(
                                        f"""
                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
                                                concat(
                                                    {pz_prefix}Infos{pzfields_sep}{profile},
                                                    '{criterion_infos}'
                                                )
                                        """
                                    )
                                sql_set_option = ",".join(sql_set)

                                # Build the UPDATE applying this criterion's WHERE clause
                                if sql_set_option:

                                    if criterion_mode in ["operation"]:

                                        # Numeric value: compare as FLOAT via
                                        # comparison_map; non-numeric falls back
                                        # to SIMILAR TO pattern matching
                                        try:
                                            float(criterion_value)
                                            sql_update = f"""
                                                UPDATE {table_variants}
                                                SET {sql_set_option}
                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
                                            """
                                        # NOTE(review): bare except — consider
                                        # narrowing to (TypeError, ValueError)
                                        except:
                                            contains_option = ""
                                            if criterion_type == "contains":
                                                contains_option = ".*"
                                            sql_update = f"""
                                                UPDATE {table_variants}
                                                SET {sql_set_option}
                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
                                            """
                                        sql_queries.append(sql_update)

                                    elif criterion_mode in ["sql"]:

                                        sql_update = f"""
                                            UPDATE {table_variants}
                                            SET {sql_set_option}
                                            WHERE {criterion_sql}
                                        """
                                        sql_queries.append(sql_update)

                                    else:
                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
                                        log.error(msg_err)
                                        raise ValueError(msg_err)

                                else:
                                    log.warning(
                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
                                    )

                        # PZTags: summary "field#value" pairs appended to INFO
                        if (
                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
                            in list_of_pzfields
                        ):

                            # Build the PZTags value as a concat() fragment spliced
                            # into the UPDATE below (note: typo "PZFalgs" fixed)
                            pztags_value = ""
                            pztags_sep_default = ","
                            pztags_sep = ""
                            for pzfield in pzfields:
                                if pzfield not in [f"{pz_prefix}Tags"]:
                                    if (
                                        f"{pzfield}{pzfields_sep}{profile}"
                                        in list_of_pzfields
                                    ):
                                        if pzfield in [f"{pz_prefix}Flag"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
                                                    THEN 'PASS'
                                                    ELSE 'FILTERED'
                                                END, '"""
                                        elif pzfield in [f"{pz_prefix}Class"]:
                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
                                                    ELSE '.'
                                                END, '"""
                                        else:
                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
                                        pztags_sep = pztags_sep_default

                            # Append per-profile PZTags to INFO (';' only when
                            # INFO already has content)
                            sql_update_pztags = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                        INFO,
                                        CASE WHEN INFO NOT in ('','.')
                                                THEN ';'
                                                ELSE ''
                                        END,
                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
                                    )
                                """
                            sql_queries.append(sql_update_pztags)

                            # Append bare PZTags for the default profile
                            # NOTE(review): unlike the per-profile UPDATE above,
                            # this one always prepends ';' without checking INFO
                            # for ''/'.' — confirm INFO is guaranteed non-empty here
                            if profile == default_profile:
                                sql_update_pztags_default = f"""
                                UPDATE {table_variants}
                                SET INFO = concat(
                                        INFO,
                                        ';',
                                        '{pz_prefix}Tags={pztags_value}'
                                    )
                                """
                                sql_queries.append(sql_update_pztags_default)

                        log.info(f"""Profile '{profile}' - Prioritization... """)

                        # Execute all accumulated UPDATE statements for this profile
                        if sql_queries:

                            for sql_query in sql_queries:
                                log.debug(
                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
                                )
                                self.conn.execute(sql_query)

                        log.info(f"""Profile '{profile}' - Update... """)
 7412                        sql_query_update = f"""
 7413                            UPDATE {table_variants}
 7414                            SET INFO =  
 7415                                concat(
 7416                                    CASE
 7417                                        WHEN INFO NOT IN ('','.')
 7418                                        THEN concat(INFO, ';')
 7419                                        ELSE ''
 7420                                    END
 7421                                    {sql_set_info_option}
 7422                                )
 7423                        """
 7424                        self.conn.execute(sql_query_update)
 7425
 7426        else:
 7427
 7428            log.warning(f"No profiles in parameters")
 7429
 7430        # Remove added columns
 7431        for added_column in added_columns:
 7432            self.drop_column(column=added_column)
 7433
 7434        # Explode INFOS fields into table fields
 7435        if self.get_explode_infos():
 7436            self.explode_infos(
 7437                prefix=self.get_explode_infos_prefix(),
 7438                fields=self.get_explode_infos_fields(),
 7439                force=True,
 7440            )
 7441
 7442        return True
 7443
 7444    ###
 7445    # HGVS
 7446    ###
 7447
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Pipeline (as implemented below):
        1. Load refSeq (and optionally refSeqLink) databases into DuckDB and Polars frames.
        2. Select SNV/InDel variants only (REF and ALT strictly alphabetic).
        3. Annotate each variant in parallel via Dask partitions (one partition per thread).
        4. Write results into a temporary randomized column, append them to INFO as
           'hgvs=...', and declare the 'hgvs' INFO field in the VCF header.

        Returns early (no-op) when no "hgvs" section is present in the parameters.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # NOTE: the two nested functions below close over names defined later in this
        # method (polars_conn, refseq_df, refseqlink_df, transcripts, genome, use_* flags);
        # they are only executed through ddf.map_partitions() once those names exist.

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            The function `partition_function` applies the `annotation_hgvs_partition` function to
            each row of a DataFrame called `partition`.

            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
            to be processed
            :return: the result of applying the "annotation_hgvs_partition" function to each row of
            the "partition" dataframe along the axis 1.
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            The function `annotation_hgvs_partition` takes in a row of data and returns a string
            containing a list of HGVS names associated with the given genomic coordinates and alleles.

            :param row: A dictionary-like object that contains the values for the keys
            "CHROM", "POS", "REF" and "ALT"
            :return: a comma-separated string of the HGVS names associated with the row.
            """

            # NOTE(review): 'chr' shadows the builtin; kept as-is
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # (values are interpolated into SQL; assumes trusted, well-formed variant data)
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # Optionally emit a second, protein-level HGVS name for the same transcript
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): 'pl' is presumably polars brought in by a wildcard import
        # (howard.functions.commons) — confirm; this first context is superseded
        # by the one re-created below after the dataframes exist
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse comma-separated "hgvs_options" (var or var=val) into param["hgvs"]
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    # Bare flag (no '=') means the option is enabled
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; no "hgvs" section means nothing to do
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink: param overrides config values
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit file first, otherwise resolve from folder + assembly
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSeq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT strictly alphabetic)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # NOTE(review): randomized suffix avoids clashing with an existing column;
        # collision is still possible (1000 values) — confirm add_column handles it
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # (only transcripts overlapping the selected variants)
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        # Materialized as a Polars DataFrame so the SQL context below can query it
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # Exported to a temporary TSV because read_transcripts consumes a file handle.
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # Re-created here, presumably so register_globals picks up the freshly
        # created refseq_df/refseqlink_df frames — TODO confirm
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        # NOTE(review): 'dd' is presumably dask.dataframe from a wildcard import — confirm
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column (only for non-empty, non-null annotations)
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=' annotation, prefixing ';' unless INFO is empty
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header: declare the new 'hgvs' INFO field in the VCF header
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns (drop the temporary hgvs_* working column)
        for added_column in added_columns:
            self.drop_column(column=added_column)
 7830
 7831    ###
 7832    # Calculation
 7833    ###
 7834
 7835    def get_operations_help(
 7836        self, operations_config_dict: dict = {}, operations_config_file: str = None
 7837    ) -> list:
 7838
 7839        # Init
 7840        operations_help = []
 7841
 7842        # operations
 7843        operations = self.get_config_json(
 7844            name="calculations",
 7845            config_dict=operations_config_dict,
 7846            config_file=operations_config_file,
 7847        )
 7848        for op in operations:
 7849            op_name = operations[op].get("name", op).upper()
 7850            op_description = operations[op].get("description", op_name)
 7851            op_available = operations[op].get("available", False)
 7852            if op_available:
 7853                operations_help.append(f"   {op_name}: {op_description}")
 7854
 7855        # Sort operations
 7856        operations_help.sort()
 7857
 7858        # insert header
 7859        operations_help.insert(0, "Available calculation operations:")
 7860
 7861        # Return
 7862        return operations_help
 7863
 7864    def calculation(
 7865        self,
 7866        operations: dict = {},
 7867        operations_config_dict: dict = {},
 7868        operations_config_file: str = None,
 7869    ) -> None:
 7870        """
 7871        It takes a list of operations, and for each operation, it checks if it's a python or sql
 7872        operation, and then calls the appropriate function
 7873
 7874        param json example:
 7875            "calculation": {
 7876                "NOMEN": {
 7877                    "options": {
 7878                        "hgvs_field": "hgvs"
 7879                    },
 7880                "middle" : null
 7881            }
 7882        """
 7883
 7884        # Param
 7885        param = self.get_param()
 7886
 7887        # operations config
 7888        operations_config = self.get_config_json(
 7889            name="calculations",
 7890            config_dict=operations_config_dict,
 7891            config_file=operations_config_file,
 7892        )
 7893
 7894        # Upper keys
 7895        operations_config = {k.upper(): v for k, v in operations_config.items()}
 7896
 7897        # Calculations
 7898
 7899        # Operations from param
 7900        operations = param.get("calculation", {}).get("calculations", operations)
 7901
 7902        # Quick calculation - add
 7903        if param.get("calculations", None):
 7904            calculations_list = [
 7905                value for value in param.get("calculations", "").split(",")
 7906            ]
 7907            log.info(f"Quick Calculations:")
 7908            for calculation_key in calculations_list:
 7909                log.info(f"   {calculation_key}")
 7910            for calculation_operation in calculations_list:
 7911                if calculation_operation.upper() not in operations:
 7912                    operations[calculation_operation.upper()] = {}
 7913                    add_value_into_dict(
 7914                        dict_tree=param,
 7915                        sections=[
 7916                            "calculation",
 7917                            "calculations",
 7918                            calculation_operation.upper(),
 7919                        ],
 7920                        value={},
 7921                    )
 7922
 7923        # Operations for calculation
 7924        if not operations:
 7925            operations = param.get("calculation", {}).get("calculations", {})
 7926
 7927        if operations:
 7928            log.info(f"Calculations...")
 7929
 7930        # For each operations
 7931        for operation_name in operations:
 7932            operation_name = operation_name.upper()
 7933            if operation_name not in [""]:
 7934                if operation_name in operations_config:
 7935                    log.info(f"Calculation '{operation_name}'")
 7936                    operation = operations_config[operation_name]
 7937                    operation_type = operation.get("type", "sql")
 7938                    if operation_type == "python":
 7939                        self.calculation_process_function(
 7940                            operation=operation, operation_name=operation_name
 7941                        )
 7942                    elif operation_type == "sql":
 7943                        self.calculation_process_sql(
 7944                            operation=operation, operation_name=operation_name
 7945                        )
 7946                    else:
 7947                        log.error(
 7948                            f"Operations config: Type '{operation_type}' NOT available"
 7949                        )
 7950                        raise ValueError(
 7951                            f"Operations config: Type '{operation_type}' NOT available"
 7952                        )
 7953                else:
 7954                    log.error(
 7955                        f"Operations config: Calculation '{operation_name}' NOT available"
 7956                    )
 7957                    raise ValueError(
 7958                        f"Operations config: Calculation '{operation_name}' NOT available"
 7959                    )
 7960
 7961        # Explode INFOS fields into table fields
 7962        if self.get_explode_infos():
 7963            self.explode_infos(
 7964                prefix=self.get_explode_infos_prefix(),
 7965                fields=self.get_explode_infos_fields(),
 7966                force=True,
 7967            )
 7968
 7969    def calculation_process_sql(
 7970        self, operation: dict, operation_name: str = "unknown"
 7971    ) -> None:
 7972        """
 7973        The `calculation_process_sql` function takes in a mathematical operation as a string and
 7974        performs the operation, updating the specified table with the result.
 7975
 7976        :param operation: The `operation` parameter is a dictionary that contains information about the
 7977        mathematical operation to be performed. It includes the following keys:
 7978        :type operation: dict
 7979        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7980        the mathematical operation being performed. It is used for logging and error handling purposes,
 7981        defaults to unknown
 7982        :type operation_name: str (optional)
 7983        """
 7984
 7985        # table variants
 7986        table_variants = self.get_table_variants(clause="alter")
 7987
 7988        # Operation infos
 7989        operation_name = operation.get("name", "unknown")
 7990        log.debug(f"process sql {operation_name}")
 7991        output_column_name = operation.get("output_column_name", operation_name)
 7992        output_column_type = operation.get("output_column_type", "String")
 7993        prefix = operation.get("explode_infos_prefix", "")
 7994        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 7995        output_column_description = operation.get(
 7996            "output_column_description", f"{operation_name} operation"
 7997        )
 7998        operation_query = operation.get("operation_query", None)
 7999        if isinstance(operation_query, list):
 8000            operation_query = " ".join(operation_query)
 8001        operation_info_fields = operation.get("info_fields", [])
 8002        operation_info_fields_check = operation.get("info_fields_check", False)
 8003        operation_info = operation.get("operation_info", True)
 8004
 8005        if operation_query:
 8006
 8007            # Info fields check
 8008            operation_info_fields_check_result = True
 8009            if operation_info_fields_check:
 8010                header_infos = self.get_header().infos
 8011                for info_field in operation_info_fields:
 8012                    operation_info_fields_check_result = (
 8013                        operation_info_fields_check_result
 8014                        and info_field in header_infos
 8015                    )
 8016
 8017            # If info fields available
 8018            if operation_info_fields_check_result:
 8019
 8020                # Added_columns
 8021                added_columns = []
 8022
 8023                # Create VCF header field
 8024                vcf_reader = self.get_header()
 8025                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8026                    output_column_name,
 8027                    ".",
 8028                    output_column_type,
 8029                    output_column_description,
 8030                    "howard calculation",
 8031                    "0",
 8032                    self.code_type_map.get(output_column_type),
 8033                )
 8034
 8035                # Explode infos if needed
 8036                log.debug(f"calculation_process_sql prefix {prefix}")
 8037                added_columns += self.explode_infos(
 8038                    prefix=prefix,
 8039                    fields=[output_column_name] + operation_info_fields,
 8040                    force=True,
 8041                )
 8042
 8043                # Create column
 8044                added_column = self.add_column(
 8045                    table_name=table_variants,
 8046                    column_name=prefix + output_column_name,
 8047                    column_type=output_column_type_sql,
 8048                    default_value="null",
 8049                )
 8050                added_columns.append(added_column)
 8051
 8052                # Operation calculation
 8053                try:
 8054
 8055                    # Query to update calculation column
 8056                    sql_update = f"""
 8057                        UPDATE {table_variants}
 8058                        SET "{prefix}{output_column_name}" = ({operation_query})
 8059                    """
 8060                    self.conn.execute(sql_update)
 8061
 8062                    # Add to INFO
 8063                    if operation_info:
 8064                        sql_update_info = f"""
 8065                            UPDATE {table_variants}
 8066                            SET "INFO" =
 8067                                concat(
 8068                                    CASE
 8069                                        WHEN "INFO" IS NOT NULL
 8070                                        THEN concat("INFO", ';')
 8071                                        ELSE ''
 8072                                    END,
 8073                                    '{output_column_name}=',
 8074                                    "{prefix}{output_column_name}"
 8075                                )
 8076                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8077                        """
 8078                        self.conn.execute(sql_update_info)
 8079
 8080                except:
 8081                    log.error(
 8082                        f"Operations config: Calculation '{operation_name}' query failed"
 8083                    )
 8084                    raise ValueError(
 8085                        f"Operations config: Calculation '{operation_name}' query failed"
 8086                    )
 8087
 8088                # Remove added columns
 8089                for added_column in added_columns:
 8090                    log.debug(f"added_column: {added_column}")
 8091                    self.drop_column(column=added_column)
 8092
 8093            else:
 8094                log.error(
 8095                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8096                )
 8097                raise ValueError(
 8098                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8099                )
 8100
 8101        else:
 8102            log.error(
 8103                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8104            )
 8105            raise ValueError(
 8106                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8107            )
 8108
 8109    def calculation_process_function(
 8110        self, operation: dict, operation_name: str = "unknown"
 8111    ) -> None:
 8112        """
 8113        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8114        function with the given parameters.
 8115
 8116        :param operation: The `operation` parameter is a dictionary that contains information about the
 8117        operation to be performed. It has the following keys:
 8118        :type operation: dict
 8119        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8120        the operation being performed. It is used for logging purposes, defaults to unknown
 8121        :type operation_name: str (optional)
 8122        """
 8123
 8124        operation_name = operation["name"]
 8125        log.debug(f"process sql {operation_name}")
 8126        function_name = operation["function_name"]
 8127        function_params = operation["function_params"]
 8128        getattr(self, function_name)(*function_params)
 8129
 8130    def calculation_variant_id(self) -> None:
 8131        """
 8132        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8133        updates the INFO field of a variants table with the variant ID.
 8134        """
 8135
 8136        # variant_id annotation field
 8137        variant_id_tag = self.get_variant_id_column()
 8138        added_columns = [variant_id_tag]
 8139
 8140        # variant_id hgvs tags"
 8141        vcf_infos_tags = {
 8142            variant_id_tag: "howard variant ID annotation",
 8143        }
 8144
 8145        # Variants table
 8146        table_variants = self.get_table_variants()
 8147
 8148        # Header
 8149        vcf_reader = self.get_header()
 8150
 8151        # Add variant_id to header
 8152        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8153            variant_id_tag,
 8154            ".",
 8155            "String",
 8156            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8157            "howard calculation",
 8158            "0",
 8159            self.code_type_map.get("String"),
 8160        )
 8161
 8162        # Update
 8163        sql_update = f"""
 8164            UPDATE {table_variants}
 8165            SET "INFO" = 
 8166                concat(
 8167                    CASE
 8168                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8169                        THEN ''
 8170                        ELSE concat("INFO", ';')
 8171                    END,
 8172                    '{variant_id_tag}=',
 8173                    "{variant_id_tag}"
 8174                )
 8175        """
 8176        self.conn.execute(sql_update)
 8177
 8178        # Remove added columns
 8179        for added_column in added_columns:
 8180            self.drop_column(column=added_column)
 8181
    def calculation_extract_snpeff_hgvs(
        self,
        snpeff_hgvs: str = "snpeff_hgvs",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        Extract HGVS nomenclatures from the snpEff annotation field of the VCF
        and append them to the INFO column as a new tag.

        The snpEff annotation header (the ' | '-separated field list quoted in
        the description of the snpEff INFO field) is parsed first; each
        variant's annotation value is then reduced to an HGVS string via
        `extract_snpeff_hgvs`, and the result is concatenated into INFO as
        '<snpeff_hgvs>=<value>'. If the snpEff field is absent from the
        header, a warning is logged and the table is left unchanged.

        :param snpeff_hgvs: Name of the INFO tag that will receive the HGVS
            nomenclatures extracted from the snpEff annotation field, defaults
            to snpeff_hgvs
        :type snpeff_hgvs: str (optional)
        :param snpeff_field: Name of the INFO field that contains the snpEff
            annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff field description does not contain a
            quoted ' | '-separated header
        """

        # Snpeff hgvs tags (description used when declaring the header INFO)
        vcf_infos_tags = {
            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
        }

        # Prefix of the exploded INFO columns
        # NOTE(review): any truthy prefix is replaced by "INFO/" — presumably
        # to match the column naming produced by explode_infos; confirm that
        # custom prefixes are intended to be ignored here
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the exploded snpEff field and the new HGVS values
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped again at the end
        added_columns = []

        # Explode the snpEff INFO field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])

        if snpeff_field in vcf_reader.infos:

            log.debug(vcf_reader.infos[snpeff_field])

            # Extract the snpEff sub-field names from the quoted,
            # ' | '-separated list in the field's description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the key, but map
                    # it back to the original sub-field label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used as join key for the UPDATE below
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variant ID and exploded snpEff annotation into a DataFrame
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Compute the HGVS string for each variant from its annotation
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: extract_snpeff_hgvs(
                    str(x), header=list(ann_header_desc.values())
                )
            )

            # Declare the new tag in the VCF header
            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
                snpeff_hgvs,
                ".",
                "String",
                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO for variants with a non-empty
            # result; the FROM clause reads the local DataFrame
            # 'dataframe_snpeff_hgvs' by variable name (duckdb replacement
            # scan), joined on the variant ID column
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                    '{snpeff_hgvs}=',
                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the DataFrame eagerly (it can be large)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8318
    def calculation_snpeff_ann_explode(
        self,
        uniquify: bool = True,
        output_format: str = "fields",
        output_prefix: str = "snpeff_",
        snpeff_field: str = "ANN",
    ) -> None:
        """
        The `calculation_snpeff_ann_explode` function explodes the snpEff
        annotation field of a VCF into either per-sub-field INFO tags or a
        single JSON INFO tag, and appends the result to the INFO column.

        The snpEff annotation header (the ' | '-separated field list quoted in
        the description of the snpEff INFO field) is parsed first; each
        variant's annotation value is then exploded via `explode_snpeff_ann`,
        and the result is concatenated into INFO. If the snpEff field is
        absent from the header, a warning is logged and the table is left
        unchanged.

        :param uniquify: The `uniquify` parameter is a boolean flag forwarded
            to `explode_snpeff_ann` that determines whether the output should
            be uniquified (duplicate entries removed), defaults to True
        :type uniquify: bool (optional)
        :param output_format: The `output_format` parameter specifies the
            format of the generated annotations: "fields" produces one INFO
            tag per snpEff sub-field, "JSON" produces a single JSON-valued
            tag named after `output_prefix`, defaults to fields
        :type output_format: str (optional)
        :param output_prefix: The `output_prefix` parameter is the prefix
            added to the generated INFO tags (or, in JSON mode, the name of
            the single output tag), defaults to snpeff_
        :type output_prefix: str (optional)
        :param snpeff_field: The `snpeff_field` parameter is the name of the
            INFO field that contains the snpEff annotations, defaults to ANN
        :type snpeff_field: str (optional)
        :raises ValueError: if the snpEff field description does not contain a
            quoted ' | '-separated header
        """

        # Internal name of this calculation's annotation column
        snpeff_hgvs = "snpeff_ann_explode"

        # Description used when declaring the header INFO entries
        vcf_infos_tags = {
            snpeff_hgvs: "Explode snpEff annotations",
        }

        # Prefix of the exploded INFO columns
        # NOTE(review): any truthy prefix is replaced by "INFO/" — presumably
        # to match the column naming produced by explode_infos; confirm that
        # custom prefixes are intended to be ignored here
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the exploded snpEff field and the computed output
        speff_ann_infos = prefix + snpeff_field
        speff_hgvs_infos = prefix + snpeff_hgvs

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added along the way, dropped again at the end
        added_columns = []

        # Explode the snpEff INFO field into its own column
        added_columns += self.explode_infos(fields=[snpeff_field])
        log.debug(f"snpeff_field={snpeff_field}")
        log.debug(f"added_columns={added_columns}")

        if snpeff_field in vcf_reader.infos:

            # Extract the snpEff sub-field names from the quoted,
            # ' | '-separated list in the field's description
            ann_description = vcf_reader.infos[snpeff_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Keep only alphanumeric characters for the key, but map
                    # it back to the original sub-field label
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Variant ID column used as join key for the UPDATE below
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Pull variant ID and exploded snpEff annotation into a DataFrame
            dataframe_snpeff_hgvs = self.get_query_to_df(
                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
            )

            # Explode each variant's annotation into the requested format
            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
                speff_ann_infos
            ].apply(
                lambda x: explode_snpeff_ann(
                    str(x),
                    uniquify=uniquify,
                    output_format=output_format,
                    prefix=output_prefix,
                    header=list(ann_header_desc.values()),
                )
            )

            # Declare the new tag(s) in the VCF header:
            # - JSON mode: one tag named after the prefix, value prefixed with
            #   '<output_prefix>=' in the SQL below
            # - fields mode: one tag per snpEff sub-field, values already
            #   carrying their own 'tag=' prefixes from explode_snpeff_ann
            ann_annotations_prefix = ""
            if output_format.upper() in ["JSON"]:
                ann_annotations_prefix = f"{output_prefix}="
                vcf_reader.infos[output_prefix] = vcf.parser._Info(
                    output_prefix,
                    ".",
                    "String",
                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                    + " - JSON format",
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
            else:
                for ann_annotation in ann_header:
                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
                        ann_annotation_id,
                        ".",
                        "String",
                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
                        "howard calculation",
                        "0",
                        self.code_type_map.get("String"),
                    )

            # Append the exploded annotations to INFO for variants with a
            # non-empty result; the FROM clause reads the local DataFrame
            # 'dataframe_snpeff_hgvs' by variable name (duckdb replacement
            # scan), joined on the variant ID column
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
                            THEN concat(
                                '{ann_annotations_prefix}',
                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_snpeff_hgvs
                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Release the DataFrame eagerly (it can be large)
            del dataframe_snpeff_hgvs
            gc.collect()

        else:

            log.warning(
                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8493
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS NOMEN nomenclatures from the configured HGVS INFO
        field and append them to the INFO column.

        For each variant, `find_nomen` derives a dictionary of NOMEN
        components (NOMEN, CNOMEN, RNOMEN, ...) from the HGVS field,
        optionally constrained by a preferred-transcripts file taken from
        `param["calculation"]["calculations"]["NOMEN"]["options"]`. Each
        component is declared in the VCF header and concatenated into INFO as
        ';<FIELD>=<value>'. If the exploded HGVS column is absent, the table
        is left unchanged.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the temporary DataFrame column holding the per-variant
        # NOMEN dictionary returned by find_nomen
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN components and their header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix of the exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header
        vcf_reader = self.get_header()

        # Name of the INFO field containing the HGVS nomenclatures
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Optional preferred-transcripts file (first column = transcript IDs)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added along the way, dropped again at the end
        added_columns = []

        # Explode the HGVS INFO field into its own column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Only proceed if the exploded HGVS column actually exists
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        if extra_field in extra_infos:

            # Pull variant coordinates and HGVS values into a DataFrame
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Compute the NOMEN dictionary for each variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode the NOMEN dictionary into one column per component and
            # build one SQL CASE expression per component for the update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column (apply runs within this
                # iteration, so capturing nomen_field in the lambda is safe)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Declare the component in the VCF header
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # Comma-join the CASE expressions as further concat() arguments
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Append the NOMEN components to INFO; the FROM clause reads the
            # local DataFrame 'dataframe_hgvs' by variable name (duckdb
            # replacement scan), joined on the variant coordinates
            # NOTE(review): unlike sibling calculations, '.' INFO values are
            # kept as-is and each component carries a leading ';' — so an
            # empty ('') INFO yields a leading ';' in the result; confirm
            # whether this separator handling is intended
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Release the DataFrame eagerly (it can be large)
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8636
    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
        """
        The function `calculation_find_by_pipeline` computes, per variant, the
        number of pipelines/samples in which the variant is found (via the
        `findbypipeline` helper applied to FORMAT and sample columns) and
        appends the result to the INFO column. Does nothing when the VCF has
        no FORMAT column or no samples.

        :param tag: The `tag` parameter is a string that represents the
        annotation field for the "findbypipeline" information in the VCF file.
        It is used to create the annotation field in the VCF header and to
        update the corresponding field in the variants table, defaults to
        findbypipeline
        :type tag: str (optional)
        """

        # Only applicable when genotype data is present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # findbypipeline annotation field
            findbypipeline_tag = tag

            # Description used when declaring the header INFO entry
            vcf_infos_tags = {
                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
            }

            # Prefix of the exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # DataFrame column receiving the computed values
            findbypipeline_infos = prefix + findbypipeline_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant ID column used as join key for the UPDATE below
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # SELECT list: variant ID, FORMAT and every sample column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Pull genotype data into a DataFrame
            dataframe_findbypipeline = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the findbypipeline value row by row
            dataframe_findbypipeline[findbypipeline_infos] = (
                dataframe_findbypipeline.apply(
                    lambda row: findbypipeline(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the new tag in the VCF header
            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
                findbypipeline_tag,
                ".",
                "String",
                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO for variants with a non-empty
            # result; the FROM clause reads the local DataFrame
            # 'dataframe_findbypipeline' by variable name (duckdb replacement
            # scan), joined on the variant ID column
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE 
                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
                            THEN concat(
                                    '{findbypipeline_tag}=',
                                    dataframe_findbypipeline."{findbypipeline_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_findbypipeline
                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the DataFrame eagerly (it can be large)
            del dataframe_findbypipeline
            gc.collect()
 8742
    def calculation_genotype_concordance(self) -> None:
        """
        The function `calculation_genotype_concordance` computes, per variant,
        the genotype concordance across callers/samples of a multi-caller VCF
        (via the `genotypeconcordance` helper applied to FORMAT and sample
        columns) and appends the result to the INFO column as the
        'genotypeconcordance' tag. Does nothing when the VCF has no FORMAT
        column or no samples.
        """

        # Only applicable when genotype data is present
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # genotypeconcordance annotation field
            genotypeconcordance_tag = "genotypeconcordance"

            # Description used when declaring the header INFO entry
            vcf_infos_tags = {
                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
            }

            # Prefix of the exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # DataFrame column receiving the computed values
            genotypeconcordance_infos = prefix + genotypeconcordance_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Variant ID column used as join key for the UPDATE below
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # SELECT list: variant ID, FORMAT and every sample column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Pull genotype data into a DataFrame
            dataframe_genotypeconcordance = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the genotype concordance row by row
            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
                dataframe_genotypeconcordance.apply(
                    lambda row: genotypeconcordance(
                        row, samples=self.get_header_sample_list()
                    ),
                    axis=1,
                )
            )

            # Declare the new tag in the VCF header (the "snpEff hgvs
            # annotations" fallback below is never used since the key is set
            # just above — it looks copy-pasted from a sibling method)
            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
                genotypeconcordance_tag,
                ".",
                "String",
                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
                "howard calculation",
                "0",
                self.code_type_map.get("String"),
            )

            # Append '<tag>=<value>' to INFO for variants with a non-empty
            # result; the FROM clause reads the local DataFrame
            # 'dataframe_genotypeconcordance' by variable name (duckdb
            # replacement scan), joined on the variant ID column
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        CASE
                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
                            THEN concat(
                                    '{genotypeconcordance_tag}=',
                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
                                )
                            ELSE ''
                        END
                    )
                FROM dataframe_genotypeconcordance
                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the DataFrame eagerly (it can be large)
            del dataframe_genotypeconcordance
            gc.collect()
 8843    def calculation_barcode(self, tag: str = "barcode") -> None:
 8844        """
 8845        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 8846        updates the INFO field in the file with the calculated barcode values.
 8847
 8848        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 8849        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 8850        the default tag name is set to "barcode", defaults to barcode
 8851        :type tag: str (optional)
 8852        """
 8853
 8854        # if FORMAT and samples
 8855        if (
 8856            "FORMAT" in self.get_header_columns_as_list()
 8857            and self.get_header_sample_list()
 8858        ):
 8859
 8860            # barcode annotation field
 8861            if not tag:
 8862                tag = "barcode"
 8863
 8864            # VCF infos tags
 8865            vcf_infos_tags = {
 8866                tag: "barcode calculation (VaRank)",
 8867            }
 8868
 8869            # Prefix
 8870            prefix = self.get_explode_infos_prefix()
 8871
 8872            # Field
 8873            barcode_infos = prefix + tag
 8874
 8875            # Variants table
 8876            table_variants = self.get_table_variants()
 8877
 8878            # Header
 8879            vcf_reader = self.get_header()
 8880
 8881            # Create variant id
 8882            variant_id_column = self.get_variant_id_column()
 8883            added_columns = [variant_id_column]
 8884
 8885            # variant_id, FORMAT and samples
 8886            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8887                self.get_header_sample_list()
 8888            )
 8889
 8890            # Create dataframe
 8891            dataframe_barcode = self.get_query_to_df(
 8892                f""" SELECT {samples_fields} FROM {table_variants} """
 8893            )
 8894
 8895            # Create barcode column
 8896            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8897                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 8898            )
 8899
 8900            # Add barcode to header
 8901            vcf_reader.infos[tag] = vcf.parser._Info(
 8902                tag,
 8903                ".",
 8904                "String",
 8905                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 8906                "howard calculation",
 8907                "0",
 8908                self.code_type_map.get("String"),
 8909            )
 8910
 8911            # Update
 8912            sql_update = f"""
 8913                UPDATE {table_variants}
 8914                SET "INFO" = 
 8915                    concat(
 8916                        CASE
 8917                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8918                            THEN ''
 8919                            ELSE concat("INFO", ';')
 8920                        END,
 8921                        CASE
 8922                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 8923                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 8924                            THEN concat(
 8925                                    '{tag}=',
 8926                                    dataframe_barcode."{barcode_infos}"
 8927                                )
 8928                            ELSE ''
 8929                        END
 8930                    )
 8931                FROM dataframe_barcode
 8932                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 8933            """
 8934            self.conn.execute(sql_update)
 8935
 8936            # Remove added columns
 8937            for added_column in added_columns:
 8938                self.drop_column(column=added_column)
 8939
 8940            # Delete dataframe
 8941            del dataframe_barcode
 8942            gc.collect()
 8943
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        The `calculation_barcode_family` function calculates a family barcode from the genotypes of
        the pedigree samples and appends it to every genotype column of the VCF file.

        Two FORMAT fields are declared in the header and appended to each sample
        genotype: `<tag>` (the family barcode itself) and `<tag>S` (the
        comma-separated list of samples it was computed from). Samples outside the
        pedigree receive '.' for both fields.

        The pedigree is read from
        param['calculation']['calculations']['BARCODEFAMILY']['family_pedigree'];
        it may be a JSON file path, a JSON string, a comma-separated list of
        sample names, or a dict. When undefined, all header samples are used.

        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
        the barcode tag that will be added to the VCF file during the calculation process. If no value
        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
        :type tag: str (optional)
        :raises ValueError: if the pedigree is not well formatted or resolves to no sample
        """

        # Family barcode needs genotypes: require a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (guard against an explicitly empty tag)
            if not tag:
                tag = "BCF"

            # VCF infos tags (descriptions for the header FORMAT declarations)
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED
            if ped:

                # Pedigree is a file (path expanded by full_path before the check)
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat it as a comma-separated list of sample names
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct the list of pedigree samples (dict values)
                ped_samples = list(ped.values())

            else:
                # No pedigree defined: use every sample of the header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Field holding the computed barcode in the dataframe
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (column added to join dataframe and table)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (one value per variant from the pedigree genotypes)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Declare the barcode FORMAT field in the header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            # Declare the barcode-samples FORMAT field in the header
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: append ':<barcode>:<samples>' to each pedigree genotype,
            # ':<tag>:<tag>S' to FORMAT, and ':.:.' to non-pedigree samples
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    value = "'.'"
                    value_samples = "'.'"
                # For missing genotypes ('./.'): strip the FORMAT field names and
                # turn each remaining ':' separator into ':.' to pad with dots
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()
 9133
 9134    def calculation_trio(self) -> None:
 9135        """
 9136        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9137        information to the INFO field of each variant.
 9138        """
 9139
 9140        # if FORMAT and samples
 9141        if (
 9142            "FORMAT" in self.get_header_columns_as_list()
 9143            and self.get_header_sample_list()
 9144        ):
 9145
 9146            # trio annotation field
 9147            trio_tag = "trio"
 9148
 9149            # VCF infos tags
 9150            vcf_infos_tags = {
 9151                "trio": "trio calculation",
 9152            }
 9153
 9154            # Param
 9155            param = self.get_param()
 9156
 9157            # Prefix
 9158            prefix = self.get_explode_infos_prefix()
 9159
 9160            # Trio param
 9161            trio_ped = (
 9162                param.get("calculation", {})
 9163                .get("calculations", {})
 9164                .get("TRIO", {})
 9165                .get("trio_pedigree", None)
 9166            )
 9167
 9168            # Load trio
 9169            if trio_ped:
 9170
 9171                # Trio pedigree is a file
 9172                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9173                    log.debug("TRIO pedigree is file")
 9174                    with open(full_path(trio_ped)) as trio_ped:
 9175                        trio_ped = json.load(trio_ped)
 9176
 9177                # Trio pedigree is a string
 9178                elif isinstance(trio_ped, str):
 9179                    log.debug("TRIO pedigree is str")
 9180                    try:
 9181                        trio_ped = json.loads(trio_ped)
 9182                        log.debug("TRIO pedigree is json str")
 9183                    except ValueError as e:
 9184                        trio_samples = trio_ped.split(",")
 9185                        if len(trio_samples) == 3:
 9186                            trio_ped = {
 9187                                "father": trio_samples[0],
 9188                                "mother": trio_samples[1],
 9189                                "child": trio_samples[2],
 9190                            }
 9191                            log.debug("TRIO pedigree is list str")
 9192                        else:
 9193                            msg_error = "TRIO pedigree not well formatted"
 9194                            log.error(msg_error)
 9195                            raise ValueError(msg_error)
 9196
 9197                # Trio pedigree is a dict
 9198                elif isinstance(trio_ped, dict):
 9199                    log.debug("TRIO pedigree is dict")
 9200
 9201                # Trio pedigree is not well formatted
 9202                else:
 9203                    msg_error = "TRIO pedigree not well formatted"
 9204                    log.error(msg_error)
 9205                    raise ValueError(msg_error)
 9206
 9207                # Construct trio list
 9208                trio_samples = [
 9209                    trio_ped.get("father", ""),
 9210                    trio_ped.get("mother", ""),
 9211                    trio_ped.get("child", ""),
 9212                ]
 9213
 9214            else:
 9215                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9216                samples_list = self.get_header_sample_list()
 9217                if len(samples_list) >= 3:
 9218                    trio_samples = self.get_header_sample_list()[0:3]
 9219                    trio_ped = {
 9220                        "father": trio_samples[0],
 9221                        "mother": trio_samples[1],
 9222                        "child": trio_samples[2],
 9223                    }
 9224                else:
 9225                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9226                    log.error(msg_error)
 9227                    raise ValueError(msg_error)
 9228
 9229            # Check trio pedigree
 9230            if not trio_ped or len(trio_ped) != 3:
 9231                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9232                log.error(msg_error)
 9233                raise ValueError(msg_error)
 9234
 9235            # Log
 9236            log.info(
 9237                f"Calculation 'TRIO' - Samples: "
 9238                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9239            )
 9240
 9241            # Field
 9242            trio_infos = prefix + trio_tag
 9243
 9244            # Variants table
 9245            table_variants = self.get_table_variants()
 9246
 9247            # Header
 9248            vcf_reader = self.get_header()
 9249
 9250            # Create variant id
 9251            variant_id_column = self.get_variant_id_column()
 9252            added_columns = [variant_id_column]
 9253
 9254            # variant_id, FORMAT and samples
 9255            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9256                self.get_header_sample_list()
 9257            )
 9258
 9259            # Create dataframe
 9260            dataframe_trio = self.get_query_to_df(
 9261                f""" SELECT {samples_fields} FROM {table_variants} """
 9262            )
 9263
 9264            # Create trio column
 9265            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9266                lambda row: trio(row, samples=trio_samples), axis=1
 9267            )
 9268
 9269            # Add trio to header
 9270            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9271                trio_tag,
 9272                ".",
 9273                "String",
 9274                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9275                "howard calculation",
 9276                "0",
 9277                self.code_type_map.get("String"),
 9278            )
 9279
 9280            # Update
 9281            sql_update = f"""
 9282                UPDATE {table_variants}
 9283                SET "INFO" = 
 9284                    concat(
 9285                        CASE
 9286                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9287                            THEN ''
 9288                            ELSE concat("INFO", ';')
 9289                        END,
 9290                        CASE
 9291                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9292                             AND dataframe_trio."{trio_infos}" NOT NULL
 9293                            THEN concat(
 9294                                    '{trio_tag}=',
 9295                                    dataframe_trio."{trio_infos}"
 9296                                )
 9297                            ELSE ''
 9298                        END
 9299                    )
 9300                FROM dataframe_trio
 9301                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9302            """
 9303            self.conn.execute(sql_update)
 9304
 9305            # Remove added columns
 9306            for added_column in added_columns:
 9307                self.drop_column(column=added_column)
 9308
 9309            # Delete dataframe
 9310            del dataframe_trio
 9311            gc.collect()
 9312
 9313    def calculation_vaf_normalization(self) -> None:
 9314        """
 9315        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9316        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9317        :return: The function does not return anything.
 9318        """
 9319
 9320        # if FORMAT and samples
 9321        if (
 9322            "FORMAT" in self.get_header_columns_as_list()
 9323            and self.get_header_sample_list()
 9324        ):
 9325
 9326            # vaf_normalization annotation field
 9327            vaf_normalization_tag = "VAF"
 9328
 9329            # VCF infos tags
 9330            vcf_infos_tags = {
 9331                "VAF": "VAF Variant Frequency",
 9332            }
 9333
 9334            # Prefix
 9335            prefix = self.get_explode_infos_prefix()
 9336
 9337            # Variants table
 9338            table_variants = self.get_table_variants()
 9339
 9340            # Header
 9341            vcf_reader = self.get_header()
 9342
 9343            # Do not calculate if VAF already exists
 9344            if "VAF" in vcf_reader.formats:
 9345                log.debug("VAF already on genotypes")
 9346                return
 9347
 9348            # Create variant id
 9349            variant_id_column = self.get_variant_id_column()
 9350            added_columns = [variant_id_column]
 9351
 9352            # variant_id, FORMAT and samples
 9353            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9354                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9355            )
 9356
 9357            # Create dataframe
 9358            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9359            log.debug(f"query={query}")
 9360            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9361
 9362            vaf_normalization_set = []
 9363
 9364            # for each sample vaf_normalization
 9365            for sample in self.get_header_sample_list():
 9366                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9367                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9368                )
 9369                vaf_normalization_set.append(
 9370                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9371                )
 9372
 9373            # Add VAF to FORMAT
 9374            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9375                "FORMAT"
 9376            ].apply(lambda x: str(x) + ":VAF")
 9377            vaf_normalization_set.append(
 9378                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9379            )
 9380
 9381            # Add vaf_normalization to header
 9382            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9383                id=vaf_normalization_tag,
 9384                num="1",
 9385                type="Float",
 9386                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9387                type_code=self.code_type_map.get("Float"),
 9388            )
 9389
 9390            # Create fields to add in INFO
 9391            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9392
 9393            # Update
 9394            sql_update = f"""
 9395                UPDATE {table_variants}
 9396                SET {sql_vaf_normalization_set}
 9397                FROM dataframe_vaf_normalization
 9398                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9399
 9400            """
 9401            self.conn.execute(sql_update)
 9402
 9403            # Remove added columns
 9404            for added_column in added_columns:
 9405                self.drop_column(column=added_column)
 9406
 9407            # Delete dataframe
 9408            del dataframe_vaf_normalization
 9409            gc.collect()
 9410
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        The `calculation_genotype_stats` function calculates genotype statistics for a given information
        field in a VCF file and updates the INFO column of the variants table with the calculated
        statistics.

        Seven INFO tags are generated and declared in the header:
        `<info>_stats_nb`, `<info>_stats_list`, `<info>_stats_min`,
        `<info>_stats_max`, `<info>_stats_mean`, `<info>_stats_mediane` and
        `<info>_stats_stdev`, computed over all samples by the `genotype_stats`
        helper.

        :param info: The `info` parameter is a string that represents the type of information for which
        genotype statistics are calculated. It is used to generate various VCF info tags for the
        statistics, such as the number of occurrences, the list of values, the minimum value, the
        maximum value, the mean, the median, defaults to VAF
        :type info: str (optional)
        """

        # Statistics need genotypes: require a FORMAT column and at least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # vaf_stats annotation field
            vaf_stats_tag = info + "_stats"

            # VCF infos tags (one INFO tag per statistic, with its header description)
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # Field holding the stats (as a dict) in the dataframe
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id (column added to join dataframe and table)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Create dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create vaf_stats column (one statistics mapping per variant)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # List of vcf tags
            sql_vaf_stats_fields = []

            # Check all VAF stats infos
            for stat in vcf_infos_tags:

                # Extract the statistic into its own dataframe column
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Add the statistic tag to the header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # ';' separator before every field except the first one
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO (skipped when the value is NULL)
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Update
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_vaf_stats
            gc.collect()
 9548
 9549    def calculation_transcripts_annotation(
 9550        self, info_json: str = None, info_format: str = None
 9551    ) -> None:
 9552        """
 9553        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9554        field to it if transcripts are available.
 9555
 9556        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9557        is a string parameter that represents the information field to be used in the transcripts JSON.
 9558        It is used to specify the JSON format for the transcripts information. If no value is provided
 9559        when calling the method, it defaults to "
 9560        :type info_json: str
 9561        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9562        method is a string parameter that specifies the format of the information field to be used in
 9563        the transcripts JSON. It is used to define the format of the information field
 9564        :type info_format: str
 9565        """
 9566
 9567        # Create transcripts table
 9568        transcripts_table = self.create_transcript_view()
 9569
 9570        # Add info field
 9571        if transcripts_table:
 9572            self.transcript_view_to_variants(
 9573                transcripts_table=transcripts_table,
 9574                transcripts_info_field_json=info_json,
 9575                transcripts_info_field_format=info_format,
 9576            )
 9577        else:
 9578            log.info("No Transcripts to process. Check param.json file configuration")
 9579
 9580    def calculation_transcripts_prioritization(self) -> None:
 9581        """
 9582        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9583        prioritizes transcripts based on certain criteria.
 9584        """
 9585
 9586        # Create transcripts table
 9587        transcripts_table = self.create_transcript_view()
 9588
 9589        # Add info field
 9590        if transcripts_table:
 9591            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9592        else:
 9593            log.info("No Transcripts to process. Check param.json file configuration")
 9594
 9595    ###############
 9596    # Transcripts #
 9597    ###############
 9598
 9599    def transcripts_prioritization(
 9600        self, transcripts_table: str = None, param: dict = {}
 9601    ) -> bool:
 9602        """
 9603        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
 9604        and updates the variants table with the prioritized information.
 9605
 9606        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
 9607        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
 9608        This parameter is used to identify the table where the transcripts data is stored for the
 9609        prioritization process
 9610        :type transcripts_table: str
 9611        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
 9612        that contains various configuration settings for the prioritization process of transcripts. It
 9613        is used to customize the behavior of the prioritization algorithm and includes settings such as
 9614        the prefix for prioritization fields, default profiles, and other
 9615        :type param: dict
 9616        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
 9617        transcripts prioritization process is successfully completed, and `False` if there are any
 9618        issues or if no profile is defined for transcripts prioritization.
 9619        """
 9620
 9621        log.debug("Start transcripts prioritization...")
 9622
 9623        # Param
 9624        if not param:
 9625            param = self.get_param()
 9626
 9627        # Variants table
 9628        table_variants = self.get_table_variants()
 9629        log.debug(f"transcripts_table={transcripts_table}")
 9630        # Transcripts table
 9631        if transcripts_table is None:
 9632            log.debug(f"transcripts_table={transcripts_table}")
 9633            transcripts_table = self.create_transcript_view(
 9634                transcripts_table="transcripts", param=param
 9635            )
 9636            log.debug(f"transcripts_table={transcripts_table}")
 9637        if transcripts_table is None:
 9638            msg_err = "No Transcripts table availalble"
 9639            log.error(msg_err)
 9640            raise ValueError(msg_err)
 9641
 9642        # Get transcripts columns
 9643        columns_as_list_query = f"""
 9644            DESCRIBE {transcripts_table}
 9645        """
 9646        columns_as_list = list(
 9647            self.get_query_to_df(columns_as_list_query)["column_name"]
 9648        )
 9649
 9650        # Create INFO if not exists
 9651        if "INFO" not in columns_as_list:
 9652            query_add_info = f"""
 9653                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
 9654            """
 9655            self.execute_query(query_add_info)
 9656
 9657        # Prioritization param and Force only PZ Score and Flag
 9658        pz_param = param.get("transcripts", {}).get("prioritization", {})
 9659        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
 9660        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
 9661        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
 9662        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
 9663        pz_profile_default = (
 9664            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
 9665        )
 9666
 9667        # Exit if no profile
 9668        if pz_profile_default is None:
 9669            log.warning("No profile defined for transcripts prioritization")
 9670            return False
 9671
 9672        # Prioritization
 9673        prioritization_result = self.prioritization(
 9674            table=transcripts_table,
 9675            pz_param=param.get("transcripts", {}).get("prioritization", {}),
 9676        )
 9677        if not prioritization_result:
 9678            log.warning("Transcripts prioritization not processed")
 9679            return False
 9680
 9681        # Explode PZ fields
 9682        self.explode_infos(
 9683            table=transcripts_table,
 9684            fields=param.get("transcripts", {})
 9685            .get("prioritization", {})
 9686            .get("pzfields", []),
 9687        )
 9688
 9689        # Export Transcripts prioritization infos to variants table
 9690        query_update = f"""
 9691            WITH RankedTranscripts AS (
 9692                SELECT
 9693                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
 9694                    ROW_NUMBER() OVER (
 9695                        PARTITION BY "#CHROM", POS, REF, ALT
 9696                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
 9697                    ) AS rn
 9698                FROM
 9699                    {transcripts_table}
 9700            )
 9701            UPDATE {table_variants}
 9702                SET
 9703                INFO = CONCAT(CASE
 9704                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9705                            THEN ''
 9706                            ELSE concat("INFO", ';')
 9707                        END,
 9708                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
 9709                        )
 9710            FROM
 9711                RankedTranscripts
 9712            WHERE
 9713                rn = 1
 9714                AND variants."#CHROM" = RankedTranscripts."#CHROM"
 9715                AND variants."POS" = RankedTranscripts."POS"
 9716                AND variants."REF" = RankedTranscripts."REF"
 9717                AND variants."ALT" = RankedTranscripts."ALT"
 9718                
 9719        """
 9720        self.execute_query(query=query_update)
 9721
 9722        # Add PZ Transcript in header
 9723        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
 9724            pz_fields_transcripts,
 9725            ".",
 9726            "String",
 9727            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
 9728            "unknown",
 9729            "unknown",
 9730            code_type_map["String"],
 9731        )
 9732
 9733        # Return
 9734        return True
 9735
 9736    def create_transcript_view_from_columns_map(
 9737        self,
 9738        transcripts_table: str = "transcripts",
 9739        columns_maps: dict = {},
 9740        added_columns: list = [],
 9741        temporary_tables: list = None,
 9742        annotation_fields: list = None,
 9743    ) -> tuple[list, list, list]:
 9744        """
 9745        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
 9746        specified columns mapping for transcripts data.
 9747
 9748        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9749        the table where the transcripts data is stored or will be stored in the database. This table
 9750        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
 9751        predictions, etc. It defaults to "transcripts, defaults to transcripts
 9752        :type transcripts_table: str (optional)
 9753        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
 9754        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
 9755        represents a mapping configuration for a specific set of columns. It typically includes details such
 9756        as the main transcript column and additional information columns
 9757        :type columns_maps: dict
 9758        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
 9759        function is a list that stores the additional columns that will be added to the view being created
 9760        based on the columns map provided. These columns are generated by exploding the transcript
 9761        information columns along with the main transcript column
 9762        :type added_columns: list
 9763        :param temporary_tables: The `temporary_tables` parameter in the
 9764        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
 9765        tables created during the process of creating a transcript view from a columns map. These temporary
 9766        tables are used to store intermediate results or transformations before the final view is generated
 9767        :type temporary_tables: list
 9768        :param annotation_fields: The `annotation_fields` parameter in the
 9769        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
 9770        for annotation in the query view creation process. These fields are extracted from the
 9771        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
 9772        :type annotation_fields: list
 9773        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
 9774        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
 9775        """
 9776
 9777        log.debug("Start transcrpts view creation from columns map...")
 9778
 9779        # "from_columns_map": [
 9780        #     {
 9781        #         "transcripts_column": "Ensembl_transcriptid",
 9782        #         "transcripts_infos_columns": [
 9783        #             "genename",
 9784        #             "Ensembl_geneid",
 9785        #             "LIST_S2_score",
 9786        #             "LIST_S2_pred",
 9787        #         ],
 9788        #     },
 9789        #     {
 9790        #         "transcripts_column": "Ensembl_transcriptid",
 9791        #         "transcripts_infos_columns": [
 9792        #             "genename",
 9793        #             "VARITY_R_score",
 9794        #             "Aloft_pred",
 9795        #         ],
 9796        #     },
 9797        # ],
 9798
 9799        # Init
 9800        if temporary_tables is None:
 9801            temporary_tables = []
 9802        if annotation_fields is None:
 9803            annotation_fields = []
 9804
 9805        # Variants table
 9806        table_variants = self.get_table_variants()
 9807
 9808        for columns_map in columns_maps:
 9809
 9810            # Transcript column
 9811            transcripts_column = columns_map.get("transcripts_column", None)
 9812
 9813            # Transcripts infos columns
 9814            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
 9815
 9816            if transcripts_column is not None:
 9817
 9818                # Explode
 9819                added_columns += self.explode_infos(
 9820                    fields=[transcripts_column] + transcripts_infos_columns
 9821                )
 9822
 9823                # View clauses
 9824                clause_select = []
 9825                for field in [transcripts_column] + transcripts_infos_columns:
 9826                    clause_select.append(
 9827                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
 9828                    )
 9829                    if field not in [transcripts_column]:
 9830                        annotation_fields.append(field)
 9831
 9832                # Querey View
 9833                query = f""" 
 9834                    SELECT
 9835                        "#CHROM", POS, REF, ALT, INFO,
 9836                        "{transcripts_column}" AS 'transcript',
 9837                        {", ".join(clause_select)}
 9838                    FROM (
 9839                        SELECT 
 9840                            "#CHROM", POS, REF, ALT, INFO,
 9841                            {", ".join(clause_select)}
 9842                        FROM {table_variants}
 9843                        )
 9844                    WHERE "{transcripts_column}" IS NOT NULL
 9845                """
 9846
 9847                # Create temporary table
 9848                temporary_table = transcripts_table + "".join(
 9849                    random.choices(string.ascii_uppercase + string.digits, k=10)
 9850                )
 9851
 9852                # Temporary_tables
 9853                temporary_tables.append(temporary_table)
 9854                query_view = f"""
 9855                    CREATE TEMPORARY TABLE {temporary_table}
 9856                    AS ({query})
 9857                """
 9858                self.execute_query(query=query_view)
 9859
 9860        return added_columns, temporary_tables, annotation_fields
 9861
 9862    def create_transcript_view_from_column_format(
 9863        self,
 9864        transcripts_table: str = "transcripts",
 9865        column_formats: dict = {},
 9866        temporary_tables: list = None,
 9867        annotation_fields: list = None,
 9868    ) -> tuple[list, list, list]:
 9869        """
 9870        The `create_transcript_view_from_column_format` function generates a transcript view based on
 9871        specified column formats, adds additional columns and annotation fields, and returns the list of
 9872        temporary tables and annotation fields.
 9873
 9874        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9875        the table containing the transcripts data. This table will be used as the base table for creating
 9876        the transcript view. The default value for this parameter is "transcripts", but you can provide a
 9877        different table name if needed, defaults to transcripts
 9878        :type transcripts_table: str (optional)
 9879        :param column_formats: The `column_formats` parameter is a dictionary that contains information
 9880        about the columns to be used for creating the transcript view. Each entry in the dictionary
 9881        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
 9882        the provided code snippet:
 9883        :type column_formats: dict
 9884        :param temporary_tables: The `temporary_tables` parameter in the
 9885        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
 9886        views created during the process of creating a transcript view from a column format. These temporary
 9887        views are used to manipulate and extract data before generating the final transcript view. It
 9888        :type temporary_tables: list
 9889        :param annotation_fields: The `annotation_fields` parameter in the
 9890        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
 9891        that are extracted from the temporary views created during the process. These annotation fields are
 9892        obtained by querying the temporary views and extracting the column names excluding specific columns
 9893        like `#CH
 9894        :type annotation_fields: list
 9895        :return: The `create_transcript_view_from_column_format` function returns two lists:
 9896        `temporary_tables` and `annotation_fields`.
 9897        """
 9898
 9899        log.debug("Start transcrpts view creation from column format...")
 9900
 9901        #  "from_column_format": [
 9902        #     {
 9903        #         "transcripts_column": "ANN",
 9904        #         "transcripts_infos_column": "Feature_ID",
 9905        #     }
 9906        # ],
 9907
 9908        # Init
 9909        if temporary_tables is None:
 9910            temporary_tables = []
 9911        if annotation_fields is None:
 9912            annotation_fields = []
 9913
 9914        for column_format in column_formats:
 9915
 9916            # annotation field and transcript annotation field
 9917            annotation_field = column_format.get("transcripts_column", "ANN")
 9918            transcript_annotation = column_format.get(
 9919                "transcripts_infos_column", "Feature_ID"
 9920            )
 9921
 9922            # Temporary View name
 9923            temporary_view_name = transcripts_table + "".join(
 9924                random.choices(string.ascii_uppercase + string.digits, k=10)
 9925            )
 9926
 9927            # Create temporary view name
 9928            temporary_view_name = self.annotation_format_to_table(
 9929                uniquify=True,
 9930                annotation_field=annotation_field,
 9931                view_name=temporary_view_name,
 9932                annotation_id=transcript_annotation,
 9933            )
 9934
 9935            # Annotation fields
 9936            if temporary_view_name:
 9937                query_annotation_fields = f"""
 9938                    SELECT *
 9939                    FROM (
 9940                        DESCRIBE SELECT *
 9941                        FROM {temporary_view_name}
 9942                        )
 9943                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
 9944                """
 9945                df_annotation_fields = self.get_query_to_df(
 9946                    query=query_annotation_fields
 9947                )
 9948
 9949                # Add temporary view and annotation fields
 9950                temporary_tables.append(temporary_view_name)
 9951                annotation_fields += list(set(df_annotation_fields["column_name"]))
 9952
 9953        return temporary_tables, annotation_fields
 9954
 9955    def create_transcript_view(
 9956        self,
 9957        transcripts_table: str = None,
 9958        transcripts_table_drop: bool = True,
 9959        param: dict = {},
 9960    ) -> str:
 9961        """
 9962        The `create_transcript_view` function generates a transcript view by processing data from a
 9963        specified table based on provided parameters and structural information.
 9964
 9965        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9966        is used to specify the name of the table that will store the final transcript view data. If a table
 9967        name is not provided, the function will create a new table to store the transcript view data, and by
 9968        default,, defaults to transcripts
 9969        :type transcripts_table: str (optional)
 9970        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9971        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9972        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9973        the function will drop the existing transcripts table if it exists, defaults to True
 9974        :type transcripts_table_drop: bool (optional)
 9975        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9976        contains information needed to create a transcript view. It includes details such as the structure
 9977        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9978        the view. This parameter allows for flexibility and customization
 9979        :type param: dict
 9980        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9981        created or modified during the execution of the function.
 9982        """
 9983
 9984        log.debug("Start transcripts view creation...")
 9985
 9986        # Default
 9987        transcripts_table_default = "transcripts"
 9988
 9989        # Param
 9990        if not param:
 9991            param = self.get_param()
 9992
 9993        # Struct
 9994        struct = param.get("transcripts", {}).get("struct", None)
 9995
 9996        if struct:
 9997
 9998            # Transcripts table
 9999            if transcripts_table is None:
10000                transcripts_table = param.get("transcripts", {}).get(
10001                    "table", transcripts_table_default
10002                )
10003
10004            # added_columns
10005            added_columns = []
10006
10007            # Temporary tables
10008            temporary_tables = []
10009
10010            # Annotation fields
10011            annotation_fields = []
10012
10013            # from columns map
10014            columns_maps = struct.get("from_columns_map", [])
10015            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10016                self.create_transcript_view_from_columns_map(
10017                    transcripts_table=transcripts_table,
10018                    columns_maps=columns_maps,
10019                    added_columns=added_columns,
10020                    temporary_tables=temporary_tables,
10021                    annotation_fields=annotation_fields,
10022                )
10023            )
10024            added_columns += added_columns_tmp
10025            temporary_tables += temporary_tables_tmp
10026            annotation_fields += annotation_fields_tmp
10027
10028            # from column format
10029            column_formats = struct.get("from_column_format", [])
10030            temporary_tables_tmp, annotation_fields_tmp = (
10031                self.create_transcript_view_from_column_format(
10032                    transcripts_table=transcripts_table,
10033                    column_formats=column_formats,
10034                    temporary_tables=temporary_tables,
10035                    annotation_fields=annotation_fields,
10036                )
10037            )
10038            temporary_tables += temporary_tables_tmp
10039            annotation_fields += annotation_fields_tmp
10040
10041            # Merge temporary tables query
10042            query_merge = ""
10043            for temporary_table in temporary_tables:
10044
10045                # First temporary table
10046                if not query_merge:
10047                    query_merge = f"""
10048                        SELECT * FROM {temporary_table}
10049                    """
10050                # other temporary table (using UNION)
10051                else:
10052                    query_merge += f"""
10053                        UNION BY NAME SELECT * FROM {temporary_table}
10054                    """
10055
10056            # Merge on transcript
10057            query_merge_on_transcripts_annotation_fields = []
10058            # Aggregate all annotations fields
10059            for annotation_field in set(annotation_fields):
10060                query_merge_on_transcripts_annotation_fields.append(
10061                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
10062                )
10063            # Query for transcripts view
10064            query_merge_on_transcripts = f"""
10065                SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
10066                FROM ({query_merge})
10067                GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
10068            """
10069
10070            # Drop transcript view is necessary
10071            if transcripts_table_drop:
10072                query_drop = f"""
10073                    DROP TABLE IF EXISTS {transcripts_table};
10074                """
10075                self.execute_query(query=query_drop)
10076
10077            # Merge and create transcript view
10078            query_create_view = f"""
10079                CREATE TABLE IF NOT EXISTS {transcripts_table}
10080                AS {query_merge_on_transcripts}
10081            """
10082            self.execute_query(query=query_create_view)
10083
10084            # Remove added columns
10085            for added_column in added_columns:
10086                self.drop_column(column=added_column)
10087
10088        else:
10089
10090            transcripts_table = None
10091
10092        return transcripts_table
10093
    def annotation_format_to_table(
        self,
        uniquify: bool = True,
        annotation_field: str = "ANN",
        annotation_id: str = "Feature_ID",
        view_name: str = "transcripts",
    ) -> str:
        """
        The function `annotation_format_to_table` converts a structured annotation INFO field
        (e.g. snpEff "ANN") into a temporary table with one typed column per annotation sub-field
        and one row per transcript.

        The sub-field names are parsed from the quoted, pipe-separated list in the field's
        header description (e.g. "Functional annotations: 'Allele | Annotation | ...'").

        :param uniquify: Whether `explode_annotation_format` should uniquify values, defaults
        to True
        :type uniquify: bool (optional)
        :param annotation_field: Name of the INFO field containing the annotations; must be
        declared in the VCF header, otherwise None is returned, defaults to ANN
        :type annotation_field: str (optional)
        :param annotation_id: Annotation sub-field used as transcript identifier (exposed as
        column 'transcript' in the created table), defaults to Feature_ID
        :type annotation_id: str (optional)
        :param view_name: Name of the temporary table to create, defaults to transcripts
        :type view_name: str (optional)
        :return: The name of the created table (`view_name`), or None if `annotation_field`
        is not declared in the header
        :raises ValueError: If the header description does not contain a quoted sub-field list
        """

        # Name of the intermediate column holding the annotations exploded to JSON
        annotation_format = "annotation_explode"

        # Sanitize the transcript identifier (keep alphanumeric characters only,
        # matching the sanitization applied to the header sub-field names below)
        annotation_id = "".join(char for char in annotation_id if char.isalnum())

        # Prefix for exploded INFO columns.
        # NOTE(review): any truthy configured prefix is forced to "INFO/" here —
        # looks intentional but verify; also the SQL below references the
        # unprefixed column name '{annotation_format}', which only matches the
        # prefixed dataframe column when the prefix is empty — confirm.
        prefix = self.get_explode_infos_prefix()
        if prefix:
            prefix = "INFO/"

        # Column names for the raw annotation field and its exploded JSON form
        annotation_infos = prefix + annotation_field
        annotation_format_infos = prefix + annotation_format

        # Variants table
        table_variants = self.get_table_variants()

        # Header
        vcf_reader = self.get_header()

        # Columns added to the variants table (dropped again at the end)
        added_columns = []

        # Explode the annotation INFO field into a column of the variants table
        added_columns += self.explode_infos(fields=[annotation_field])

        if annotation_field in vcf_reader.infos:

            # Extract the sub-field names from the quoted part of the header
            # description (pipe-separated list, e.g. 'Allele | Annotation | ...')
            ann_description = vcf_reader.infos[annotation_field].desc
            pattern = r"'(.+?)'"
            match = re.search(pattern, ann_description)
            if match:
                ann_header_match = match.group(1).split(" | ")
                ann_header = []
                ann_header_desc = {}
                for i in range(len(ann_header_match)):
                    # Sanitized name (alphanumeric only) mapped to the original name
                    ann_header_info = "".join(
                        char for char in ann_header_match[i] if char.isalnum()
                    )
                    ann_header.append(ann_header_info)
                    ann_header_desc[ann_header_info] = ann_header_match[i]
                if not ann_header_desc:
                    raise ValueError("Invalid header description format")
            else:
                raise ValueError("Invalid header description format")

            # Create variant id column (helper column, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns += [variant_id_column]

            # Load the variants (with the exploded annotation column) into a DataFrame
            dataframe_annotation_format = self.get_query_to_df(
                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
            )

            # Explode each annotation value to JSON (one JSON object per transcript)
            dataframe_annotation_format[
                annotation_format_infos
            ] = dataframe_annotation_format[annotation_infos].apply(
                lambda x: explode_annotation_format(
                    annotation=str(x),
                    uniquify=uniquify,
                    output_format="JSON",
                    prefix="",
                    header=list(ann_header_desc.values()),
                )
            )

            # Find the JSON keys actually present.
            # NOTE: duckdb resolves 'dataframe_annotation_format' in the SQL text
            # to the local pandas DataFrame of the same name (replacement scan).
            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
            df_keys = self.get_query_to_df(query=query_json)

            # Build one typed SELECT expression per JSON key
            query_json_key = []
            for _, row in df_keys.iterrows():

                # Key
                key = row.iloc[0]

                # Sanitized key used as output column name
                key_clean = "".join(char for char in key if char.isalnum())

                # Extract all non-empty values of this key to infer the column type
                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""

                # Get DataFrame from query
                df_json_type = self.get_query_to_df(query=query_json_type)

                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
                with pd.option_context("future.no_silent_downcasting", True):
                    df_json_type.fillna(value="", inplace=True)
                    replace_dict = {None: np.nan, "": np.nan}
                    df_json_type.replace(replace_dict, inplace=True)
                    df_json_type.dropna(inplace=True)

                # Detect column type from the remaining (non-empty) values
                column_type = detect_column_type(df_json_type[key_clean])

                # Cast empty strings to NULL and the rest to the detected type
                query_json_key.append(
                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
                )

            # Create the temporary table, exposing the transcript identifier
            # column under the fixed name 'transcript'
            query_view = f"""
                CREATE TEMPORARY TABLE {view_name}
                AS (
                    SELECT *, {annotation_id} AS 'transcript'
                    FROM (
                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
                        FROM dataframe_annotation_format
                        )
                    );
            """
            self.execute_query(query=query_view)

        else:

            # Annotation field not in header: nothing to create
            view_name = None

        # Remove the helper columns added to the variants table
        for added_column in added_columns:
            self.drop_column(column=added_column)

        return view_name
10255
10256    def transcript_view_to_variants(
10257        self,
10258        transcripts_table: str = None,
10259        transcripts_column_id: str = None,
10260        transcripts_info_json: str = None,
10261        transcripts_info_field_json: str = None,
10262        transcripts_info_format: str = None,
10263        transcripts_info_field_format: str = None,
10264        param: dict = {},
10265    ) -> bool:
10266        """
10267        The `transcript_view_to_variants` function updates a variants table with information from
10268        transcripts in JSON format.
10269
10270        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10271        table containing the transcripts data. If this parameter is not provided, the function will
10272        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10273        :type transcripts_table: str
10274        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10275        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10276        identifier is used to match transcripts with variants in the database
10277        :type transcripts_column_id: str
10278        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10279        of the column in the variants table where the transcripts information will be stored in JSON
10280        format. This parameter allows you to define the column in the variants table that will hold the
10281        JSON-formatted information about transcripts
10282        :type transcripts_info_json: str
10283        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10284        specify the field in the VCF header that will contain information about transcripts in JSON
10285        format. This field will be added to the VCF header as an INFO field with the specified name
10286        :type transcripts_info_field_json: str
10287        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10288        format of the information about transcripts that will be stored in the variants table. This
10289        format can be used to define how the transcript information will be structured or displayed
10290        within the variants table
10291        :type transcripts_info_format: str
10292        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10293        specify the field in the VCF header that will contain information about transcripts in a
10294        specific format. This field will be added to the VCF header as an INFO field with the specified
10295        name
10296        :type transcripts_info_field_format: str
10297        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10298        that contains various configuration settings related to transcripts. It is used to provide
10299        default values for certain parameters if they are not explicitly provided when calling the
10300        method. The `param` dictionary can be passed as an argument
10301        :type param: dict
10302        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10303        if the operation is successful and `False` if certain conditions are not met.
10304        """
10305
10306        msg_info_prefix = "Start transcripts view to variants annotations"
10307
10308        log.debug(f"{msg_info_prefix}...")
10309
10310        # Default
10311        transcripts_table_default = "transcripts"
10312        transcripts_column_id_default = "transcript"
10313        transcripts_info_json_default = None
10314        transcripts_info_format_default = None
10315        transcripts_info_field_json_default = None
10316        transcripts_info_field_format_default = None
10317
10318        # Param
10319        if not param:
10320            param = self.get_param()
10321
10322        # Transcripts table
10323        if transcripts_table is None:
10324            transcripts_table = param.get("transcripts", {}).get(
10325                "table", transcripts_table_default
10326            )
10327
10328        # Transcripts column ID
10329        if transcripts_column_id is None:
10330            transcripts_column_id = param.get("transcripts", {}).get(
10331                "column_id", transcripts_column_id_default
10332            )
10333
10334        # Transcripts info json
10335        if transcripts_info_json is None:
10336            transcripts_info_json = param.get("transcripts", {}).get(
10337                "transcripts_info_json", transcripts_info_json_default
10338            )
10339
10340        # Transcripts info field JSON
10341        if transcripts_info_field_json is None:
10342            transcripts_info_field_json = param.get("transcripts", {}).get(
10343                "transcripts_info_field_json", transcripts_info_field_json_default
10344            )
10345        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10346        #     transcripts_info_json = transcripts_info_field_json
10347
10348        # Transcripts info format
10349        if transcripts_info_format is None:
10350            transcripts_info_format = param.get("transcripts", {}).get(
10351                "transcripts_info_format", transcripts_info_format_default
10352            )
10353
10354        # Transcripts info field FORMAT
10355        if transcripts_info_field_format is None:
10356            transcripts_info_field_format = param.get("transcripts", {}).get(
10357                "transcripts_info_field_format", transcripts_info_field_format_default
10358            )
10359        # if (
10360        #     transcripts_info_field_format is not None
10361        #     and transcripts_info_format is None
10362        # ):
10363        #     transcripts_info_format = transcripts_info_field_format
10364
10365        # Variants table
10366        table_variants = self.get_table_variants()
10367
10368        # Check info columns param
10369        if (
10370            transcripts_info_json is None
10371            and transcripts_info_field_json is None
10372            and transcripts_info_format is None
10373            and transcripts_info_field_format is None
10374        ):
10375            return False
10376
10377        # Transcripts infos columns
10378        query_transcripts_infos_columns = f"""
10379            SELECT *
10380            FROM (
10381                DESCRIBE SELECT * FROM {transcripts_table}
10382                )
10383            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10384        """
10385        transcripts_infos_columns = list(
10386            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10387        )
10388
10389        # View results
10390        clause_select = []
10391        clause_to_json = []
10392        clause_to_format = []
10393        for field in transcripts_infos_columns:
10394            clause_select.append(
10395                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10396            )
10397            clause_to_json.append(f""" '{field}': "{field}" """)
10398            clause_to_format.append(f""" "{field}" """)
10399
10400        # Update
10401        update_set_json = []
10402        update_set_format = []
10403
10404        # VCF header
10405        vcf_reader = self.get_header()
10406
10407        # Transcripts to info column in JSON
10408        if transcripts_info_json is not None:
10409
10410            # Create column on variants table
10411            self.add_column(
10412                table_name=table_variants,
10413                column_name=transcripts_info_json,
10414                column_type="JSON",
10415                default_value=None,
10416                drop=False,
10417            )
10418
10419            # Add header
10420            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10421                transcripts_info_json,
10422                ".",
10423                "String",
10424                "Transcripts in JSON format",
10425                "unknwon",
10426                "unknwon",
10427                self.code_type_map["String"],
10428            )
10429
10430            # Add to update
10431            update_set_json.append(
10432                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10433            )
10434
10435        # Transcripts to info field in JSON
10436        if transcripts_info_field_json is not None:
10437
10438            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10439
10440            # Add to update
10441            update_set_json.append(
10442                f""" 
10443                    INFO = concat(
10444                            CASE
10445                                WHEN INFO NOT IN ('', '.')
10446                                THEN INFO
10447                                ELSE ''
10448                            END,
10449                            CASE
10450                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10451                                THEN concat(
10452                                    ';{transcripts_info_field_json}=',
10453                                    t.{transcripts_info_json}
10454                                )
10455                                ELSE ''
10456                            END
10457                            )
10458                """
10459            )
10460
10461            # Add header
10462            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10463                transcripts_info_field_json,
10464                ".",
10465                "String",
10466                "Transcripts in JSON format",
10467                "unknwon",
10468                "unknwon",
10469                self.code_type_map["String"],
10470            )
10471
10472        if update_set_json:
10473
10474            # Update query
10475            query_update = f"""
10476                UPDATE {table_variants}
10477                    SET {", ".join(update_set_json)}
10478                FROM
10479                (
10480                    SELECT
10481                        "#CHROM", POS, REF, ALT,
10482                            concat(
10483                            '{{',
10484                            string_agg(
10485                                '"' || "{transcripts_column_id}" || '":' ||
10486                                to_json(json_output)
10487                            ),
10488                            '}}'
10489                            )::JSON AS {transcripts_info_json}
10490                    FROM
10491                        (
10492                        SELECT
10493                            "#CHROM", POS, REF, ALT,
10494                            "{transcripts_column_id}",
10495                            to_json(
10496                                {{{",".join(clause_to_json)}}}
10497                            )::JSON AS json_output
10498                        FROM
10499                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10500                        WHERE "{transcripts_column_id}" IS NOT NULL
10501                        )
10502                    GROUP BY "#CHROM", POS, REF, ALT
10503                ) AS t
10504                WHERE {table_variants}."#CHROM" = t."#CHROM"
10505                    AND {table_variants}."POS" = t."POS"
10506                    AND {table_variants}."REF" = t."REF"
10507                    AND {table_variants}."ALT" = t."ALT"
10508            """
10509
10510            self.execute_query(query=query_update)
10511
10512        # Transcripts to info column in FORMAT
10513        if transcripts_info_format is not None:
10514
10515            # Create column on variants table
10516            self.add_column(
10517                table_name=table_variants,
10518                column_name=transcripts_info_format,
10519                column_type="VARCHAR",
10520                default_value=None,
10521                drop=False,
10522            )
10523
10524            # Add header
10525            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10526                transcripts_info_format,
10527                ".",
10528                "String",
10529                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10530                "unknwon",
10531                "unknwon",
10532                self.code_type_map["String"],
10533            )
10534
10535            # Add to update
10536            update_set_format.append(
10537                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10538            )
10539
10540        # Transcripts to info field in JSON
10541        if transcripts_info_field_format is not None:
10542
10543            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10544
10545            # Add to update
10546            update_set_format.append(
10547                f""" 
10548                    INFO = concat(
10549                            CASE
10550                                WHEN INFO NOT IN ('', '.')
10551                                THEN INFO
10552                                ELSE ''
10553                            END,
10554                            CASE
10555                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10556                                THEN concat(
10557                                    ';{transcripts_info_field_format}=',
10558                                    t.{transcripts_info_format}
10559                                )
10560                                ELSE ''
10561                            END
10562                            )
10563                """
10564            )
10565
10566            # Add header
10567            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10568                transcripts_info_field_format,
10569                ".",
10570                "String",
10571                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10572                "unknwon",
10573                "unknwon",
10574                self.code_type_map["String"],
10575            )
10576
10577        if update_set_format:
10578
10579            # Update query
10580            query_update = f"""
10581                UPDATE {table_variants}
10582                    SET {", ".join(update_set_format)}
10583                FROM
10584                (
10585                    SELECT
10586                        "#CHROM", POS, REF, ALT,
10587                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10588                    FROM 
10589                        (
10590                        SELECT
10591                            "#CHROM", POS, REF, ALT,
10592                            "{transcripts_column_id}",
10593                            concat(
10594                                "{transcripts_column_id}",
10595                                '|',
10596                                {", '|', ".join(clause_to_format)}
10597                            ) AS {transcripts_info_format}
10598                        FROM
10599                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10600                        )
10601                    GROUP BY "#CHROM", POS, REF, ALT
10602                ) AS t
10603                WHERE {table_variants}."#CHROM" = t."#CHROM"
10604                    AND {table_variants}."POS" = t."POS"
10605                    AND {table_variants}."REF" = t."REF"
10606                    AND {table_variants}."ALT" = t."ALT"
10607            """
10608
10609            self.execute_query(query=query_update)
10610
10611        return True
class Variants:
   34class Variants:
   35
   36    def __init__(
   37        self,
   38        conn=None,
   39        input: str = None,
   40        output: str = None,
   41        config: dict = {},
   42        param: dict = {},
   43        load: bool = False,
   44    ) -> None:
   45        """
   46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
   47        header
   48
   49        :param conn: the connection to the database
   50        :param input: the input file
   51        :param output: the output file
   52        :param config: a dictionary containing the configuration of the model
   53        :param param: a dictionary containing the parameters of the model
   54        """
   55
   56        # Init variables
   57        self.init_variables()
   58
   59        # Input
   60        self.set_input(input)
   61
   62        # Config
   63        self.set_config(config)
   64
   65        # Param
   66        self.set_param(param)
   67
   68        # Output
   69        self.set_output(output)
   70
   71        # connexion
   72        self.set_connexion(conn)
   73
   74        # Header
   75        self.set_header()
   76
   77        # Samples
   78        self.set_samples()
   79
   80        # Load data
   81        if load:
   82            self.load_data()
   83
   84    def set_samples(self, samples: list = None) -> list:
   85        """
   86        The function `set_samples` sets the samples attribute of an object to a provided list or
   87        retrieves it from a parameter dictionary.
   88
   89        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
   90        input and sets the `samples` attribute of the class to the provided list. If no samples are
   91        provided, it tries to get the samples from the class's parameters using the `get_param` method
   92        :type samples: list
   93        :return: The `samples` list is being returned.
   94        """
   95
   96        if not samples:
   97            samples = self.get_param().get("samples", {}).get("list", None)
   98
   99        self.samples = samples
  100
  101        return samples
  102
  103    def get_samples(self) -> list:
  104        """
  105        This function returns a list of samples.
  106        :return: The `get_samples` method is returning the `samples` attribute of the object.
  107        """
  108
  109        return self.samples
  110
  111    def get_samples_check(self) -> bool:
  112        """
  113        This function returns the value of the "check" key within the "samples" dictionary retrieved
  114        from the parameters.
  115        :return: The method `get_samples_check` is returning the value of the key "check" inside the
  116        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
  117        method. If the key "check" is not found, it will return `False`.
  118        """
  119
  120        return self.get_param().get("samples", {}).get("check", True)
  121
  122    def set_input(self, input: str = None) -> None:
  123        """
  124        The function `set_input` takes a file name as input, extracts the name and extension, and sets
  125        attributes in the class accordingly.
  126
  127        :param input: The `set_input` method in the provided code snippet is used to set attributes
  128        related to the input file. Here's a breakdown of the parameters and their usage in the method:
  129        :type input: str
  130        """
  131
  132        if input and not isinstance(input, str):
  133            try:
  134                self.input = input.name
  135            except:
  136                log.error(f"Input file '{input} in bad format")
  137                raise ValueError(f"Input file '{input} in bad format")
  138        else:
  139            self.input = input
  140
  141        # Input format
  142        if input:
  143            input_name, input_extension = os.path.splitext(self.input)
  144            self.input_name = input_name
  145            self.input_extension = input_extension
  146            self.input_format = self.input_extension.replace(".", "")
  147
  148    def set_config(self, config: dict) -> None:
  149        """
  150        The set_config function takes a config object and assigns it as the configuration object for the
  151        class.
  152
  153        :param config: The `config` parameter in the `set_config` function is a dictionary object that
  154        contains configuration settings for the class. When you call the `set_config` function with a
  155        dictionary object as the argument, it will set that dictionary as the configuration object for
  156        the class
  157        :type config: dict
  158        """
  159
  160        self.config = config
  161
  162    def set_param(self, param: dict) -> None:
  163        """
  164        This function sets a parameter object for the class based on the input dictionary.
  165
  166        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
  167        as the `param` attribute of the class instance
  168        :type param: dict
  169        """
  170
  171        self.param = param
  172
  173    def init_variables(self) -> None:
  174        """
  175        This function initializes the variables that will be used in the rest of the class
  176        """
  177
  178        self.prefix = "howard"
  179        self.table_variants = "variants"
  180        self.dataframe = None
  181
  182        self.comparison_map = {
  183            "gt": ">",
  184            "gte": ">=",
  185            "lt": "<",
  186            "lte": "<=",
  187            "equals": "=",
  188            "contains": "SIMILAR TO",
  189        }
  190
  191        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
  192
  193        self.code_type_map_to_sql = {
  194            "Integer": "INTEGER",
  195            "String": "VARCHAR",
  196            "Float": "FLOAT",
  197            "Flag": "VARCHAR",
  198        }
  199
  200        self.index_additionnal_fields = []
  201
  202    def get_indexing(self) -> bool:
  203        """
  204        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
  205        returns False.
  206        :return: The value of the indexing parameter.
  207        """
  208
  209        return self.get_param().get("indexing", False)
  210
  211    def get_connexion_config(self) -> dict:
  212        """
  213        The function `get_connexion_config` returns a dictionary containing the configuration for a
  214        connection, including the number of threads and memory limit.
  215        :return: a dictionary containing the configuration for the Connexion library.
  216        """
  217
  218        # config
  219        config = self.get_config()
  220
  221        # Connexion config
  222        connexion_config = {}
  223        threads = self.get_threads()
  224
  225        # Threads
  226        if threads:
  227            connexion_config["threads"] = threads
  228
  229        # Memory
  230        # if config.get("memory", None):
  231        #     connexion_config["memory_limit"] = config.get("memory")
  232        if self.get_memory():
  233            connexion_config["memory_limit"] = self.get_memory()
  234
  235        # Temporary directory
  236        if config.get("tmp", None):
  237            connexion_config["temp_directory"] = config.get("tmp")
  238
  239        # Access
  240        if config.get("access", None):
  241            access = config.get("access")
  242            if access in ["RO"]:
  243                access = "READ_ONLY"
  244            elif access in ["RW"]:
  245                access = "READ_WRITE"
  246            connexion_db = self.get_connexion_db()
  247            if connexion_db in ":memory:":
  248                access = "READ_WRITE"
  249            connexion_config["access_mode"] = access
  250
  251        return connexion_config
  252
  253    def get_duckdb_settings(self) -> dict:
  254        """
  255        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
  256        string.
  257        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
  258        """
  259
  260        # config
  261        config = self.get_config()
  262
  263        # duckdb settings
  264        duckdb_settings_dict = {}
  265        if config.get("duckdb_settings", None):
  266            duckdb_settings = config.get("duckdb_settings")
  267            duckdb_settings = full_path(duckdb_settings)
  268            # duckdb setting is a file
  269            if os.path.exists(duckdb_settings):
  270                with open(duckdb_settings) as json_file:
  271                    duckdb_settings_dict = yaml.safe_load(json_file)
  272            # duckdb settings is a string
  273            else:
  274                duckdb_settings_dict = json.loads(duckdb_settings)
  275
  276        return duckdb_settings_dict
  277
  278    def set_connexion_db(self) -> str:
  279        """
  280        The function `set_connexion_db` returns the appropriate database connection string based on the
  281        input format and connection type.
  282        :return: the value of the variable `connexion_db`.
  283        """
  284
  285        # Default connexion db
  286        default_connexion_db = ":memory:"
  287
  288        # Find connexion db
  289        if self.get_input_format() in ["db", "duckdb"]:
  290            connexion_db = self.get_input()
  291        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
  292            connexion_db = default_connexion_db
  293        elif self.get_connexion_type() in ["tmpfile"]:
  294            tmp_name = tempfile.mkdtemp(
  295                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
  296            )
  297            connexion_db = f"{tmp_name}/tmp.db"
  298        elif self.get_connexion_type() != "":
  299            connexion_db = self.get_connexion_type()
  300        else:
  301            connexion_db = default_connexion_db
  302
  303        # Set connexion db
  304        self.connexion_db = connexion_db
  305
  306        return connexion_db
  307
  308    def set_connexion(self, conn) -> None:
  309        """
  310        The function `set_connexion` creates a connection to a database, with options for different
  311        database formats and settings.
  312
  313        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
  314        database. If a connection is not provided, a new connection to an in-memory database is created.
  315        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
  316        sqlite
  317        """
  318
  319        # Connexion db
  320        connexion_db = self.set_connexion_db()
  321
  322        # Connexion config
  323        connexion_config = self.get_connexion_config()
  324
  325        # Connexion format
  326        connexion_format = self.get_config().get("connexion_format", "duckdb")
  327        # Set connexion format
  328        self.connexion_format = connexion_format
  329
  330        # Connexion
  331        if not conn:
  332            if connexion_format in ["duckdb"]:
  333                conn = duckdb.connect(connexion_db, config=connexion_config)
  334                # duckDB settings
  335                duckdb_settings = self.get_duckdb_settings()
  336                if duckdb_settings:
  337                    for setting in duckdb_settings:
  338                        setting_value = duckdb_settings.get(setting)
  339                        if isinstance(setting_value, str):
  340                            setting_value = f"'{setting_value}'"
  341                        conn.execute(f"PRAGMA {setting}={setting_value};")
  342            elif connexion_format in ["sqlite"]:
  343                conn = sqlite3.connect(connexion_db)
  344
  345        # Set connexion
  346        self.conn = conn
  347
  348        # Log
  349        log.debug(f"connexion_format: {connexion_format}")
  350        log.debug(f"connexion_db: {connexion_db}")
  351        log.debug(f"connexion config: {connexion_config}")
  352        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")
  353
  354    def set_output(self, output: str = None) -> None:
  355        """
  356        The `set_output` function in Python sets the output file based on the input or a specified key
  357        in the config file, extracting the output name, extension, and format.
  358
  359        :param output: The `output` parameter in the `set_output` method is used to specify the name of
  360        the output file. If the config file has an 'output' key, the method sets the output to the value
  361        of that key. If no output is provided, it sets the output to `None`
  362        :type output: str
  363        """
  364
  365        if output and not isinstance(output, str):
  366            self.output = output.name
  367        else:
  368            self.output = output
  369
  370        # Output format
  371        if self.output:
  372            output_name, output_extension = os.path.splitext(self.output)
  373            self.output_name = output_name
  374            self.output_extension = output_extension
  375            self.output_format = self.output_extension.replace(".", "")
  376        else:
  377            self.output_name = None
  378            self.output_extension = None
  379            self.output_format = None
  380
  381    def set_header(self) -> None:
  382        """
  383        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
  384        """
  385
  386        input_file = self.get_input()
  387        default_header_list = [
  388            "##fileformat=VCFv4.2",
  389            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
  390        ]
  391
  392        # Full path
  393        input_file = full_path(input_file)
  394
  395        if input_file:
  396
  397            input_format = self.get_input_format()
  398            input_compressed = self.get_input_compressed()
  399            config = self.get_config()
  400            header_list = default_header_list
  401            if input_format in [
  402                "vcf",
  403                "hdr",
  404                "tsv",
  405                "csv",
  406                "psv",
  407                "parquet",
  408                "db",
  409                "duckdb",
  410            ]:
  411                # header provided in param
  412                if config.get("header_file", None):
  413                    with open(config.get("header_file"), "rt") as f:
  414                        header_list = self.read_vcf_header(f)
  415                # within a vcf file format (header within input file itsself)
  416                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
  417                    # within a compressed vcf file format (.vcf.gz)
  418                    if input_compressed:
  419                        with bgzf.open(input_file, "rt") as f:
  420                            header_list = self.read_vcf_header(f)
  421                    # within an uncompressed vcf file format (.vcf)
  422                    else:
  423                        with open(input_file, "rt") as f:
  424                            header_list = self.read_vcf_header(f)
  425                # header provided in default external file .hdr
  426                elif os.path.exists((input_file + ".hdr")):
  427                    with open(input_file + ".hdr", "rt") as f:
  428                        header_list = self.read_vcf_header(f)
  429                else:
  430                    try:  # Try to get header info fields and file columns
  431
  432                        with tempfile.TemporaryDirectory() as tmpdir:
  433
  434                            # Create database
  435                            db_for_header = Database(database=input_file)
  436
  437                            # Get header columns for infos fields
  438                            db_header_from_columns = (
  439                                db_for_header.get_header_from_columns()
  440                            )
  441
  442                            # Get real columns in the file
  443                            db_header_columns = db_for_header.get_columns()
  444
  445                            # Write header file
  446                            header_file_tmp = os.path.join(tmpdir, "header")
  447                            f = open(header_file_tmp, "w")
  448                            vcf.Writer(f, db_header_from_columns)
  449                            f.close()
  450
  451                            # Replace #CHROM line with rel columns
  452                            header_list = db_for_header.read_header_file(
  453                                header_file=header_file_tmp
  454                            )
  455                            header_list[-1] = "\t".join(db_header_columns)
  456
  457                    except:
  458
  459                        log.warning(
  460                            f"No header for file {input_file}. Set as default VCF header"
  461                        )
  462                        header_list = default_header_list
  463
  464            else:  # try for unknown format ?
  465
  466                log.error(f"Input file format '{input_format}' not available")
  467                raise ValueError(f"Input file format '{input_format}' not available")
  468
  469            if not header_list:
  470                header_list = default_header_list
  471
  472            # header as list
  473            self.header_list = header_list
  474
  475            # header as VCF object
  476            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
  477
  478        else:
  479
  480            self.header_list = None
  481            self.header_vcf = None
  482
  483    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
  484        """
  485        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
  486        DataFrame based on the connection format.
  487
  488        :param query: The `query` parameter in the `get_query_to_df` function is a string that
  489        represents the SQL query you want to execute. This query will be used to fetch data from a
  490        database and convert it into a pandas DataFrame
  491        :type query: str
  492        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
  493        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
  494        function will only fetch up to that number of rows from the database query result. If no limit
  495        is specified,
  496        :type limit: int
  497        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
  498        """
  499
  500        # Connexion format
  501        connexion_format = self.get_connexion_format()
  502
  503        # Limit in query
  504        if limit:
  505            pd.set_option("display.max_rows", limit)
  506            if connexion_format in ["duckdb"]:
  507                df = (
  508                    self.conn.execute(query)
  509                    .fetch_record_batch(limit)
  510                    .read_next_batch()
  511                    .to_pandas()
  512                )
  513            elif connexion_format in ["sqlite"]:
  514                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
  515
  516        # Full query
  517        else:
  518            if connexion_format in ["duckdb"]:
  519                df = self.conn.execute(query).df()
  520            elif connexion_format in ["sqlite"]:
  521                df = pd.read_sql_query(query, self.conn)
  522
  523        return df
  524
  525    def get_overview(self) -> None:
  526        """
  527        The function prints the input, output, config, and dataframe of the current object
  528        """
  529        table_variants_from = self.get_table_variants(clause="from")
  530        sql_columns = self.get_header_columns_as_sql()
  531        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
  532        df = self.get_query_to_df(sql_query_export)
  533        log.info(
  534            "Input:  "
  535            + str(self.get_input())
  536            + " ["
  537            + str(str(self.get_input_format()))
  538            + "]"
  539        )
  540        log.info(
  541            "Output: "
  542            + str(self.get_output())
  543            + " ["
  544            + str(str(self.get_output_format()))
  545            + "]"
  546        )
  547        log.info("Config: ")
  548        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
  549            "\n"
  550        ):
  551            log.info("\t" + str(d))
  552        log.info("Param: ")
  553        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
  554            "\n"
  555        ):
  556            log.info("\t" + str(d))
  557        log.info("Sample list: " + str(self.get_header_sample_list()))
  558        log.info("Dataframe: ")
  559        for d in str(df).split("\n"):
  560            log.info("\t" + str(d))
  561
  562        # garbage collector
  563        del df
  564        gc.collect()
  565
  566        return None
  567
  568    def get_stats(self) -> dict:
  569        """
  570        The `get_stats` function calculates and returns various statistics of the current object,
  571        including information about the input file, variants, samples, header fields, quality, and
  572        SNVs/InDels.
  573        :return: a dictionary containing various statistics of the current object. The dictionary has
  574        the following structure:
  575        """
  576
  577        # Log
  578        log.info(f"Stats Calculation...")
  579
  580        # table varaints
  581        table_variants_from = self.get_table_variants()
  582
  583        # stats dict
  584        stats = {"Infos": {}}
  585
  586        ### File
  587        input_file = self.get_input()
  588        stats["Infos"]["Input file"] = input_file
  589
  590        # Header
  591        header_infos = self.get_header().infos
  592        header_formats = self.get_header().formats
  593        header_infos_list = list(header_infos)
  594        header_formats_list = list(header_formats)
  595
  596        ### Variants
  597
  598        stats["Variants"] = {}
  599
  600        # Variants by chr
  601        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
  602        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
  603        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
  604            by=["CHROM"], kind="quicksort"
  605        )
  606
  607        # Total number of variants
  608        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
  609
  610        # Calculate percentage
  611        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
  612            lambda x: (x / nb_of_variants)
  613        )
  614
  615        stats["Variants"]["Number of variants by chromosome"] = (
  616            nb_of_variants_by_chrom.to_dict(orient="index")
  617        )
  618
  619        stats["Infos"]["Number of variants"] = int(nb_of_variants)
  620
  621        ### Samples
  622
  623        # Init
  624        samples = {}
  625        nb_of_samples = 0
  626
  627        # Check Samples
  628        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
  629            log.debug(f"Check samples...")
  630            for sample in self.get_header_sample_list():
  631                sql_query_samples = f"""
  632                    SELECT  '{sample}' as sample,
  633                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
  634                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
  635                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
  636                    FROM {table_variants_from}
  637                    WHERE (
  638                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
  639                        AND
  640                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
  641                      )
  642                    GROUP BY genotype
  643                    """
  644                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
  645                sample_genotype_count = sql_query_genotype_df["count"].sum()
  646                if len(sql_query_genotype_df):
  647                    nb_of_samples += 1
  648                    samples[f"{sample} - {sample_genotype_count} variants"] = (
  649                        sql_query_genotype_df.to_dict(orient="index")
  650                    )
  651
  652            stats["Samples"] = samples
  653            stats["Infos"]["Number of samples"] = nb_of_samples
  654
  655        # #
  656        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
  657        #     stats["Infos"]["Number of samples"] = nb_of_samples
  658        # elif nb_of_samples:
  659        #     stats["Infos"]["Number of samples"] = "not a VCF format"
  660
  661        ### INFO and FORMAT fields
  662        header_types_df = {}
  663        header_types_list = {
  664            "List of INFO fields": header_infos,
  665            "List of FORMAT fields": header_formats,
  666        }
  667        i = 0
  668        for header_type in header_types_list:
  669
  670            header_type_infos = header_types_list.get(header_type)
  671            header_infos_dict = {}
  672
  673            for info in header_type_infos:
  674
  675                i += 1
  676                header_infos_dict[i] = {}
  677
  678                # ID
  679                header_infos_dict[i]["id"] = info
  680
  681                # num
  682                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
  683                if header_type_infos[info].num in genotype_map.keys():
  684                    header_infos_dict[i]["Number"] = genotype_map.get(
  685                        header_type_infos[info].num
  686                    )
  687                else:
  688                    header_infos_dict[i]["Number"] = header_type_infos[info].num
  689
  690                # type
  691                if header_type_infos[info].type:
  692                    header_infos_dict[i]["Type"] = header_type_infos[info].type
  693                else:
  694                    header_infos_dict[i]["Type"] = "."
  695
  696                # desc
  697                if header_type_infos[info].desc != None:
  698                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
  699                else:
  700                    header_infos_dict[i]["Description"] = ""
  701
  702            if len(header_infos_dict):
  703                header_types_df[header_type] = pd.DataFrame.from_dict(
  704                    header_infos_dict, orient="index"
  705                ).to_dict(orient="index")
  706
  707        # Stats
  708        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
  709        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
  710        stats["Header"] = header_types_df
  711
  712        ### QUAL
  713        if "QUAL" in self.get_header_columns():
  714            sql_query_qual = f"""
  715                    SELECT
  716                        avg(CAST(QUAL AS INTEGER)) AS Average,
  717                        min(CAST(QUAL AS INTEGER)) AS Minimum,
  718                        max(CAST(QUAL AS INTEGER)) AS Maximum,
  719                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
  720                        median(CAST(QUAL AS INTEGER)) AS Median,
  721                        variance(CAST(QUAL AS INTEGER)) AS Variance
  722                    FROM {table_variants_from}
  723                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
  724                    """
  725
  726            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
  727            stats["Quality"] = {"Stats": qual}
  728
  729        ### SNV and InDel
  730
  731        sql_query_snv = f"""
  732            
  733            SELECT Type, count FROM (
  734
  735                    SELECT
  736                        'Total' AS Type,
  737                        count(*) AS count
  738                    FROM {table_variants_from}
  739
  740                    UNION
  741
  742                    SELECT
  743                        'MNV' AS Type,
  744                        count(*) AS count
  745                    FROM {table_variants_from}
  746                    WHERE len(REF) > 1 AND len(ALT) > 1
  747                    AND len(REF) = len(ALT)
  748
  749                    UNION
  750
  751                    SELECT
  752                        'InDel' AS Type,
  753                        count(*) AS count
  754                    FROM {table_variants_from}
  755                    WHERE len(REF) > 1 OR len(ALT) > 1
  756                    AND len(REF) != len(ALT)
  757                    
  758                    UNION
  759
  760                    SELECT
  761                        'SNV' AS Type,
  762                        count(*) AS count
  763                    FROM {table_variants_from}
  764                    WHERE len(REF) = 1 AND len(ALT) = 1
  765
  766                )
  767
  768            ORDER BY count DESC
  769
  770                """
  771        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
  772
  773        sql_query_snv_substitution = f"""
  774                SELECT
  775                    concat(REF, '>', ALT) AS 'Substitution',
  776                    count(*) AS count
  777                FROM {table_variants_from}
  778                WHERE len(REF) = 1 AND len(ALT) = 1
  779                GROUP BY REF, ALT
  780                ORDER BY count(*) DESC
  781                """
  782        snv_substitution = (
  783            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
  784        )
  785        stats["Variants"]["Counts"] = snv_indel
  786        stats["Variants"]["Substitutions"] = snv_substitution
  787
  788        return stats
  789
  790    def stats_to_file(self, file: str = None) -> str:
  791        """
  792        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
  793        into a JSON object, and writes the JSON object to the specified file.
  794
  795        :param file: The `file` parameter is a string that represents the file path where the JSON data
  796        will be written
  797        :type file: str
  798        :return: the name of the file that was written to.
  799        """
  800
  801        # Get stats
  802        stats = self.get_stats()
  803
  804        # Serializing json
  805        json_object = json.dumps(stats, indent=4)
  806
  807        # Writing to sample.json
  808        with open(file, "w") as outfile:
  809            outfile.write(json_object)
  810
  811        return file
  812
  813    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
  814        """
  815        The `print_stats` function generates a markdown file and prints the statistics contained in a
  816        JSON file in a formatted manner.
  817
  818        :param output_file: The `output_file` parameter is a string that specifies the path and filename
  819        of the output file where the stats will be printed in Markdown format. If no `output_file` is
  820        provided, a temporary directory will be created and the stats will be saved in a file named
  821        "stats.md" within that
  822        :type output_file: str
  823        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
  824        file where the statistics will be saved. If no value is provided, a temporary directory will be
  825        created and a default file name "stats.json" will be used
  826        :type json_file: str
  827        :return: The function `print_stats` does not return any value. It has a return type annotation
  828        of `None`.
  829        """
  830
  831        # Full path
  832        output_file = full_path(output_file)
  833        json_file = full_path(json_file)
  834
  835        with tempfile.TemporaryDirectory() as tmpdir:
  836
  837            # Files
  838            if not output_file:
  839                output_file = os.path.join(tmpdir, "stats.md")
  840            if not json_file:
  841                json_file = os.path.join(tmpdir, "stats.json")
  842
  843            # Create folders
  844            if not os.path.exists(os.path.dirname(output_file)):
  845                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
  846            if not os.path.exists(os.path.dirname(json_file)):
  847                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
  848
  849            # Create stats JSON file
  850            stats_file = self.stats_to_file(file=json_file)
  851
  852            # Print stats file
  853            with open(stats_file) as f:
  854                stats = yaml.safe_load(f)
  855
  856            # Output
  857            output_title = []
  858            output_index = []
  859            output = []
  860
  861            # Title
  862            output_title.append("# HOWARD Stats")
  863
  864            # Index
  865            output_index.append("## Index")
  866
  867            # Process sections
  868            for section in stats:
  869                infos = stats.get(section)
  870                section_link = "#" + section.lower().replace(" ", "-")
  871                output.append(f"## {section}")
  872                output_index.append(f"- [{section}]({section_link})")
  873
  874                if len(infos):
  875                    for info in infos:
  876                        try:
  877                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
  878                            is_df = True
  879                        except:
  880                            try:
  881                                df = pd.DataFrame.from_dict(
  882                                    json.loads((infos.get(info))), orient="index"
  883                                )
  884                                is_df = True
  885                            except:
  886                                is_df = False
  887                        if is_df:
  888                            output.append(f"### {info}")
  889                            info_link = "#" + info.lower().replace(" ", "-")
  890                            output_index.append(f"   - [{info}]({info_link})")
  891                            output.append(f"{df.to_markdown(index=False)}")
  892                        else:
  893                            output.append(f"- {info}: {infos.get(info)}")
  894                else:
  895                    output.append(f"NA")
  896
  897            # Write stats in markdown file
  898            with open(output_file, "w") as fp:
  899                for item in output_title:
  900                    fp.write("%s\n" % item)
  901                for item in output_index:
  902                    fp.write("%s\n" % item)
  903                for item in output:
  904                    fp.write("%s\n" % item)
  905
  906            # Output stats in markdown
  907            print("")
  908            print("\n\n".join(output_title))
  909            print("")
  910            print("\n\n".join(output))
  911            print("")
  912
  913        return None
  914
  915    def get_input(self) -> str:
  916        """
  917        It returns the value of the input variable.
  918        :return: The input is being returned.
  919        """
  920        return self.input
  921
  922    def get_input_format(self, input_file: str = None) -> str:
  923        """
  924        This function returns the format of the input variable, either from the provided input file or
  925        by prompting for input.
  926
  927        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
  928        represents the file path of the input file. If no `input_file` is provided when calling the
  929        method, it will default to `None`
  930        :type input_file: str
  931        :return: The format of the input variable is being returned.
  932        """
  933
  934        if not input_file:
  935            input_file = self.get_input()
  936        input_format = get_file_format(input_file)
  937        return input_format
  938
  939    def get_input_compressed(self, input_file: str = None) -> str:
  940        """
  941        The function `get_input_compressed` returns the format of the input variable after compressing
  942        it.
  943
  944        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
  945        that represents the file path of the input file. If no `input_file` is provided when calling the
  946        method, it will default to `None` and the method will then call `self.get_input()` to
  947        :type input_file: str
  948        :return: The function `get_input_compressed` returns the compressed format of the input
  949        variable.
  950        """
  951
  952        if not input_file:
  953            input_file = self.get_input()
  954        input_compressed = get_file_compressed(input_file)
  955        return input_compressed
  956
  957    def get_output(self) -> str:
  958        """
  959        It returns the output of the neuron.
  960        :return: The output of the neural network.
  961        """
  962
  963        return self.output
  964
  965    def get_output_format(self, output_file: str = None) -> str:
  966        """
  967        The function `get_output_format` returns the format of the input variable or the output file if
  968        provided.
  969
  970        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
  971        that represents the file path of the output file. If no `output_file` is provided when calling
  972        the method, it will default to the output obtained from the `get_output` method of the class
  973        instance. The
  974        :type output_file: str
  975        :return: The format of the input variable is being returned.
  976        """
  977
  978        if not output_file:
  979            output_file = self.get_output()
  980        output_format = get_file_format(output_file)
  981
  982        return output_format
  983
  984    def get_config(self) -> dict:
  985        """
  986        It returns the config
  987        :return: The config variable is being returned.
  988        """
  989        return self.config
  990
  991    def get_param(self) -> dict:
  992        """
  993        It returns the param
  994        :return: The param variable is being returned.
  995        """
  996        return self.param
  997
  998    def get_connexion_db(self) -> str:
  999        """
 1000        It returns the connexion_db attribute of the object
 1001        :return: The connexion_db is being returned.
 1002        """
 1003        return self.connexion_db
 1004
 1005    def get_prefix(self) -> str:
 1006        """
 1007        It returns the prefix of the object.
 1008        :return: The prefix is being returned.
 1009        """
 1010        return self.prefix
 1011
 1012    def get_table_variants(self, clause: str = "select") -> str:
 1013        """
 1014        This function returns the table_variants attribute of the object
 1015
 1016        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
 1017        defaults to select (optional)
 1018        :return: The table_variants attribute of the object.
 1019        """
 1020
 1021        # Access
 1022        access = self.get_config().get("access", None)
 1023
 1024        # Clauses "select", "where", "update"
 1025        if clause in ["select", "where", "update"]:
 1026            table_variants = self.table_variants
 1027        # Clause "from"
 1028        elif clause in ["from"]:
 1029            # For Read Only
 1030            if self.get_input_format() in ["parquet"] and access in ["RO"]:
 1031                input_file = self.get_input()
 1032                table_variants = f"'{input_file}' as variants"
 1033            # For Read Write
 1034            else:
 1035                table_variants = f"{self.table_variants} as variants"
 1036        else:
 1037            table_variants = self.table_variants
 1038        return table_variants
 1039
 1040    def get_tmp_dir(self) -> str:
 1041        """
 1042        The function `get_tmp_dir` returns the temporary directory path based on configuration
 1043        parameters or a default path.
 1044        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
 1045        configuration, parameters, and a default value of "/tmp".
 1046        """
 1047
 1048        return get_tmp(
 1049            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
 1050        )
 1051
 1052    def get_connexion_type(self) -> str:
 1053        """
 1054        If the connexion type is not in the list of allowed connexion types, raise a ValueError
 1055
 1056        :return: The connexion type is being returned.
 1057        """
 1058        return self.get_config().get("connexion_type", "memory")
 1059
 1060    def get_connexion(self):
 1061        """
 1062        It returns the connection object
 1063
 1064        :return: The connection object.
 1065        """
 1066        return self.conn
 1067
 1068    def close_connexion(self) -> None:
 1069        """
 1070        This function closes the connection to the database.
 1071        :return: The connection is being closed.
 1072        """
 1073        return self.conn.close()
 1074
 1075    def get_header(self, type: str = "vcf"):
 1076        """
 1077        This function returns the header of the VCF file as a list of strings
 1078
 1079        :param type: the type of header you want to get, defaults to vcf (optional)
 1080        :return: The header of the vcf file.
 1081        """
 1082
 1083        if self.header_vcf:
 1084            if type == "vcf":
 1085                return self.header_vcf
 1086            elif type == "list":
 1087                return self.header_list
 1088        else:
 1089            if type == "vcf":
 1090                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
 1091                return header
 1092            elif type == "list":
 1093                return vcf_required
 1094
 1095    def get_header_length(self, file: str = None) -> int:
 1096        """
 1097        The function `get_header_length` returns the length of the header list, excluding the #CHROM
 1098        line.
 1099
 1100        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
 1101        header file. If this argument is provided, the function will read the header from the specified
 1102        file and return the length of the header list minus 1 (to exclude the #CHROM line)
 1103        :type file: str
 1104        :return: the length of the header list, excluding the #CHROM line.
 1105        """
 1106
 1107        if file:
 1108            return len(self.read_vcf_header_file(file=file)) - 1
 1109        elif self.get_header(type="list"):
 1110            return len(self.get_header(type="list")) - 1
 1111        else:
 1112            return 0
 1113
 1114    def get_header_columns(self) -> str:
 1115        """
 1116        This function returns the header list of a VCF
 1117
 1118        :return: The length of the header list.
 1119        """
 1120        if self.get_header():
 1121            return self.get_header(type="list")[-1]
 1122        else:
 1123            return ""
 1124
 1125    def get_header_columns_as_list(self) -> list:
 1126        """
 1127        This function returns the header list of a VCF
 1128
 1129        :return: The length of the header list.
 1130        """
 1131        if self.get_header():
 1132            return self.get_header_columns().strip().split("\t")
 1133        else:
 1134            return []
 1135
 1136    def get_header_columns_as_sql(self) -> str:
 1137        """
 1138        This function retruns header length (without #CHROM line)
 1139
 1140        :return: The length of the header list.
 1141        """
 1142        sql_column_list = []
 1143        for col in self.get_header_columns_as_list():
 1144            sql_column_list.append(f'"{col}"')
 1145        return ",".join(sql_column_list)
 1146
 1147    def get_header_sample_list(
 1148        self, check: bool = False, samples: list = None, samples_force: bool = False
 1149    ) -> list:
 1150        """
 1151        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
 1152        checking and filtering based on input parameters.
 1153
 1154        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
 1155        parameter that determines whether to check if the samples in the list are properly defined as
 1156        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
 1157        list is defined as a, defaults to False
 1158        :type check: bool (optional)
 1159        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
 1160        allows you to specify a subset of samples from the header. If you provide a list of sample
 1161        names, the function will check if each sample is defined in the header. If a sample is not found
 1162        in the
 1163        :type samples: list
 1164        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
 1165        a boolean parameter that determines whether to force the function to return the sample list
 1166        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
 1167        function will return the sample list without performing, defaults to False
 1168        :type samples_force: bool (optional)
 1169        :return: The function `get_header_sample_list` returns a list of samples based on the input
 1170        parameters and conditions specified in the function.
 1171        """
 1172
 1173        # Init
 1174        samples_list = []
 1175
 1176        if samples is None:
 1177            samples_list = self.header_vcf.samples
 1178        else:
 1179            samples_checked = []
 1180            for sample in samples:
 1181                if sample in self.header_vcf.samples:
 1182                    samples_checked.append(sample)
 1183                else:
 1184                    log.warning(f"Sample '{sample}' not defined in header")
 1185            samples_list = samples_checked
 1186
 1187            # Force sample list without checking if is_genotype_column
 1188            if samples_force:
 1189                log.warning(f"Samples {samples_list} not checked if genotypes")
 1190                return samples_list
 1191
 1192        if check:
 1193            samples_checked = []
 1194            for sample in samples_list:
 1195                if self.is_genotype_column(column=sample):
 1196                    samples_checked.append(sample)
 1197                else:
 1198                    log.warning(
 1199                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
 1200                    )
 1201            samples_list = samples_checked
 1202
 1203        # Return samples list
 1204        return samples_list
 1205
 1206    def is_genotype_column(self, column: str = None) -> bool:
 1207        """
 1208        This function checks if a given column is a genotype column in a database.
 1209
 1210        :param column: The `column` parameter in the `is_genotype_column` method is a string that
 1211        represents the column name in a database table. This method checks if the specified column is a
 1212        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
 1213        method of
 1214        :type column: str
 1215        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
 1216        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
 1217        column name and returns the result. If the `column` parameter is None, it returns False.
 1218        """
 1219
 1220        if column is not None:
 1221            return Database(database=self.get_input()).is_genotype_column(column=column)
 1222        else:
 1223            return False
 1224
 1225    def get_verbose(self) -> bool:
 1226        """
 1227        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
 1228        exist
 1229
 1230        :return: The value of the key "verbose" in the config dictionary.
 1231        """
 1232        return self.get_config().get("verbose", False)
 1233
 1234    def get_connexion_format(self) -> str:
 1235        """
 1236        It returns the connexion format of the object.
 1237        :return: The connexion_format is being returned.
 1238        """
 1239        connexion_format = self.connexion_format
 1240        if connexion_format not in ["duckdb", "sqlite"]:
 1241            log.error(f"Unknown connexion format {connexion_format}")
 1242            raise ValueError(f"Unknown connexion format {connexion_format}")
 1243        else:
 1244            return connexion_format
 1245
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        The function reads a file in chunks and inserts each chunk into a table based on the specified
        database format.

        :param file: The `file` parameter is the file that you want to load into a table. It should be
        the path to the file on your system
        :param columns: The `columns` parameter in the `insert_file_to_table` function is a string that
        should contain the names of the columns in the table where the data will be inserted. The column
        names should be separated by commas within the string. For example, if you have columns named
        "id", "name
        :type columns: str
        :param header_len: The `header_len` parameter in the `insert_file_to_table` function specifies
        the number of lines to skip at the beginning of the file before reading the actual data. This
        parameter allows you to skip any header information present in the file before processing the
        data, defaults to 0
        :type header_len: int (optional)
        :param sep: The `sep` parameter in the `insert_file_to_table` function is used to specify the
        separator character that is used in the file being read. In this case, the default separator is
        set to `\t`, which represents a tab character. You can change this parameter to a different
        separator character if, defaults to \t
        :type sep: str (optional)
        :param chunksize: The `chunksize` parameter specifies the number of rows to read in at a time
        when processing the file in chunks. In the provided code snippet, the default value for
        `chunksize` is set to 1000000. This means that the file will be read in chunks of 1,, defaults
        to 1000000
        :type chunksize: int (optional)
        """

        # Config
        # The config value `load.chunk` overrides the `chunksize` argument when present
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE: if chunksize resolves to a falsy value (0/None), nothing is
        # loaded — the function is a silent no-op in that case
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # "FROM chunk" resolves to the local pandas DataFrame
                    # `chunk` — presumably via duckdb's replacement scan of
                    # caller locals; do not rename the `chunk` variable
                    # (TODO confirm against duckdb version in use)
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # sqlite path: append the chunk through pandas' to_sql
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)
 1299
 1300    def load_data(
 1301        self,
 1302        input_file: str = None,
 1303        drop_variants_table: bool = False,
 1304        sample_size: int = 20480,
 1305    ) -> None:
 1306        """
 1307        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
 1308        table before loading the data and specify a sample size.
 1309
 1310        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
 1311        table
 1312        :type input_file: str
 1313        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
 1314        determines whether the variants table should be dropped before loading the data. If set to
 1315        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
 1316        not be dropped, defaults to False
 1317        :type drop_variants_table: bool (optional)
 1318        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
 1319        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
 1320        20480
 1321        :type sample_size: int (optional)
 1322        """
 1323
 1324        log.info("Loading...")
 1325
 1326        # change input file
 1327        if input_file:
 1328            self.set_input(input_file)
 1329            self.set_header()
 1330
 1331        # drop variants table
 1332        if drop_variants_table:
 1333            self.drop_variants_table()
 1334
 1335        # get table variants
 1336        table_variants = self.get_table_variants()
 1337
 1338        # Access
 1339        access = self.get_config().get("access", None)
 1340        log.debug(f"access: {access}")
 1341
 1342        # Input format and compress
 1343        input_format = self.get_input_format()
 1344        input_compressed = self.get_input_compressed()
 1345        log.debug(f"input_format: {input_format}")
 1346        log.debug(f"input_compressed: {input_compressed}")
 1347
 1348        # input_compressed_format
 1349        if input_compressed:
 1350            input_compressed_format = "gzip"
 1351        else:
 1352            input_compressed_format = "none"
 1353        log.debug(f"input_compressed_format: {input_compressed_format}")
 1354
 1355        # Connexion format
 1356        connexion_format = self.get_connexion_format()
 1357
 1358        # Sample size
 1359        if not sample_size:
 1360            sample_size = -1
 1361        log.debug(f"sample_size: {sample_size}")
 1362
 1363        # Load data
 1364        log.debug(f"Load Data from {input_format}")
 1365
 1366        # DuckDB connexion
 1367        if connexion_format in ["duckdb"]:
 1368
 1369            # Database already exists
 1370            if self.input_format in ["db", "duckdb"]:
 1371
 1372                if connexion_format in ["duckdb"]:
 1373                    log.debug(f"Input file format '{self.input_format}' duckDB")
 1374                else:
 1375                    log.error(
 1376                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1377                    )
 1378                    raise ValueError(
 1379                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
 1380                    )
 1381
 1382            # Load from existing database format
 1383            else:
 1384
 1385                try:
 1386                    # Create Table or View
 1387                    database = Database(database=self.input)
 1388                    sql_from = database.get_sql_from(sample_size=sample_size)
 1389
 1390                    if access in ["RO"]:
 1391                        sql_load = (
 1392                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
 1393                        )
 1394                    else:
 1395                        sql_load = (
 1396                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
 1397                        )
 1398                    self.conn.execute(sql_load)
 1399
 1400                except:
 1401                    # Format not available
 1402                    log.error(f"Input file format '{self.input_format}' not available")
 1403                    raise ValueError(
 1404                        f"Input file format '{self.input_format}' not available"
 1405                    )
 1406
 1407        # SQLite connexion
 1408        elif connexion_format in ["sqlite"] and input_format in [
 1409            "vcf",
 1410            "tsv",
 1411            "csv",
 1412            "psv",
 1413        ]:
 1414
 1415            # Main structure
 1416            structure = {
 1417                "#CHROM": "VARCHAR",
 1418                "POS": "INTEGER",
 1419                "ID": "VARCHAR",
 1420                "REF": "VARCHAR",
 1421                "ALT": "VARCHAR",
 1422                "QUAL": "VARCHAR",
 1423                "FILTER": "VARCHAR",
 1424                "INFO": "VARCHAR",
 1425            }
 1426
 1427            # Strcuture with samples
 1428            structure_complete = structure
 1429            if self.get_header_sample_list():
 1430                structure["FORMAT"] = "VARCHAR"
 1431                for sample in self.get_header_sample_list():
 1432                    structure_complete[sample] = "VARCHAR"
 1433
 1434            # Columns list for create and insert
 1435            sql_create_table_columns = []
 1436            sql_create_table_columns_list = []
 1437            for column in structure_complete:
 1438                column_type = structure_complete[column]
 1439                sql_create_table_columns.append(
 1440                    f'"{column}" {column_type} default NULL'
 1441                )
 1442                sql_create_table_columns_list.append(f'"{column}"')
 1443
 1444            # Create database
 1445            log.debug(f"Create Table {table_variants}")
 1446            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
 1447            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
 1448            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
 1449            self.conn.execute(sql_create_table)
 1450
 1451            # chunksize define length of file chunk load file
 1452            chunksize = 100000
 1453
 1454            # delimiter
 1455            delimiter = file_format_delimiters.get(input_format, "\t")
 1456
 1457            # Load the input file
 1458            with open(self.input, "rt") as input_file:
 1459
 1460                # Use the appropriate file handler based on the input format
 1461                if input_compressed:
 1462                    input_file = bgzf.open(self.input, "rt")
 1463                if input_format in ["vcf"]:
 1464                    header_len = self.get_header_length()
 1465                else:
 1466                    header_len = 0
 1467
 1468                # Insert the file contents into a table
 1469                self.insert_file_to_table(
 1470                    input_file,
 1471                    columns=sql_create_table_columns_list_sql,
 1472                    header_len=header_len,
 1473                    sep=delimiter,
 1474                    chunksize=chunksize,
 1475                )
 1476
 1477        else:
 1478            log.error(
 1479                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1480            )
 1481            raise ValueError(
 1482                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
 1483            )
 1484
 1485        # Explode INFOS fields into table fields
 1486        if self.get_explode_infos():
 1487            self.explode_infos(
 1488                prefix=self.get_explode_infos_prefix(),
 1489                fields=self.get_explode_infos_fields(),
 1490                force=True,
 1491            )
 1492
 1493        # Create index after insertion
 1494        self.create_indexes()
 1495
 1496    def get_explode_infos(self) -> bool:
 1497        """
 1498        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
 1499        to False if it is not set.
 1500        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
 1501        value. If the parameter is not present, it will return False.
 1502        """
 1503
 1504        return self.get_param().get("explode", {}).get("explode_infos", False)
 1505
 1506    def get_explode_infos_fields(
 1507        self,
 1508        explode_infos_fields: str = None,
 1509        remove_fields_not_in_header: bool = False,
 1510    ) -> list:
 1511        """
 1512        The `get_explode_infos_fields` function returns a list of exploded information fields based on
 1513        the input parameter `explode_infos_fields`.
 1514
 1515        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
 1516        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
 1517        comma-separated list of field names to explode
 1518        :type explode_infos_fields: str
 1519        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
 1520        flag that determines whether to remove fields that are not present in the header. If it is set
 1521        to `True`, any field that is not in the header will be excluded from the list of exploded
 1522        information fields. If it is set to `, defaults to False
 1523        :type remove_fields_not_in_header: bool (optional)
 1524        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
 1525        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
 1526        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
 1527        Otherwise, it returns a list of exploded information fields after removing any spaces and
 1528        splitting the string by commas.
 1529        """
 1530
 1531        # If no fields, get it in param
 1532        if not explode_infos_fields:
 1533            explode_infos_fields = (
 1534                self.get_param().get("explode", {}).get("explode_infos_fields", None)
 1535            )
 1536
 1537        # If no fields, defined as all fields in header using keyword
 1538        if not explode_infos_fields:
 1539            explode_infos_fields = "*"
 1540
 1541        # If fields list not empty
 1542        if explode_infos_fields:
 1543
 1544            # Input fields list
 1545            if isinstance(explode_infos_fields, str):
 1546                fields_input = explode_infos_fields.split(",")
 1547            elif isinstance(explode_infos_fields, list):
 1548                fields_input = explode_infos_fields
 1549            else:
 1550                fields_input = []
 1551
 1552            # Fields list without * keyword
 1553            fields_without_all = fields_input.copy()
 1554            if "*".casefold() in (item.casefold() for item in fields_without_all):
 1555                fields_without_all.remove("*")
 1556
 1557            # Fields in header
 1558            fields_in_header = sorted(list(set(self.get_header().infos)))
 1559
 1560            # Construct list of fields
 1561            fields_output = []
 1562            for field in fields_input:
 1563
 1564                # Strip field
 1565                field = field.strip()
 1566
 1567                # format keyword * in regex
 1568                if field.upper() in ["*"]:
 1569                    field = ".*"
 1570
 1571                # Find all fields with pattern
 1572                r = re.compile(field)
 1573                fields_search = sorted(list(filter(r.match, fields_in_header)))
 1574
 1575                # Remove fields input from search
 1576                if field in fields_search:
 1577                    fields_search = [field]
 1578                elif fields_search != [field]:
 1579                    fields_search = sorted(
 1580                        list(set(fields_search).difference(fields_input))
 1581                    )
 1582
 1583                # If field is not in header (avoid not well formatted header)
 1584                if not fields_search and not remove_fields_not_in_header:
 1585                    fields_search = [field]
 1586
 1587                # Add found fields
 1588                for new_field in fields_search:
 1589                    # Add field, if not already exists, and if it is in header (if asked)
 1590                    if (
 1591                        new_field not in fields_output
 1592                        and (
 1593                            not remove_fields_not_in_header
 1594                            or new_field in fields_in_header
 1595                        )
 1596                        and new_field not in [".*"]
 1597                    ):
 1598                        fields_output.append(new_field)
 1599
 1600            return fields_output
 1601
 1602        else:
 1603
 1604            return []
 1605
 1606    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
 1607        """
 1608        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
 1609        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
 1610        not provided.
 1611
 1612        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
 1613        prefix to be used for exploding or expanding information
 1614        :type explode_infos_prefix: str
 1615        :return: the value of the variable `explode_infos_prefix`.
 1616        """
 1617
 1618        if not explode_infos_prefix:
 1619            explode_infos_prefix = (
 1620                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
 1621            )
 1622
 1623        return explode_infos_prefix
 1624
 1625    def add_column(
 1626        self,
 1627        table_name,
 1628        column_name,
 1629        column_type,
 1630        default_value=None,
 1631        drop: bool = False,
 1632    ) -> dict:
 1633        """
 1634        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
 1635        doesn't already exist.
 1636
 1637        :param table_name: The name of the table to which you want to add a column
 1638        :param column_name: The parameter "column_name" is the name of the column that you want to add
 1639        to the table
 1640        :param column_type: The `column_type` parameter specifies the data type of the column that you
 1641        want to add to the table. It should be a string that represents the desired data type, such as
 1642        "INTEGER", "TEXT", "REAL", etc
 1643        :param default_value: The `default_value` parameter is an optional parameter that specifies the
 1644        default value for the newly added column. If a default value is provided, it will be assigned to
 1645        the column for any existing rows that do not have a value for that column
 1646        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
 1647        if it already exists in the table. If `drop` is set to `True`, the function will drop the
 1648        existing column before adding the new column. If `drop` is set to `False` (default),, defaults
 1649        to False
 1650        :type drop: bool (optional)
 1651        :return: a boolean value indicating whether the column was successfully added to the table.
 1652        """
 1653
 1654        # added
 1655        added = False
 1656        dropped = False
 1657
 1658        # Check if the column already exists in the table
 1659        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1660        columns = self.get_query_to_df(query).columns.tolist()
 1661        if column_name.upper() in [c.upper() for c in columns]:
 1662            log.debug(
 1663                f"The {column_name} column already exists in the {table_name} table"
 1664            )
 1665            if drop:
 1666                self.drop_column(table_name=table_name, column_name=column_name)
 1667                dropped = True
 1668            else:
 1669                return None
 1670        else:
 1671            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1672
 1673        # Add column in table
 1674        add_column_query = (
 1675            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
 1676        )
 1677        if default_value is not None:
 1678            add_column_query += f" DEFAULT {default_value}"
 1679        self.execute_query(add_column_query)
 1680        added = not dropped
 1681        log.debug(
 1682            f"The {column_name} column was successfully added to the {table_name} table"
 1683        )
 1684
 1685        if added:
 1686            added_column = {
 1687                "table_name": table_name,
 1688                "column_name": column_name,
 1689                "column_type": column_type,
 1690                "default_value": default_value,
 1691            }
 1692        else:
 1693            added_column = None
 1694
 1695        return added_column
 1696
 1697    def drop_column(
 1698        self, column: dict = None, table_name: str = None, column_name: str = None
 1699    ) -> bool:
 1700        """
 1701        The `drop_column` function drops a specified column from a given table in a database and returns
 1702        True if the column was successfully dropped, and False if the column does not exist in the
 1703        table.
 1704
 1705        :param column: The `column` parameter is a dictionary that contains information about the column
 1706        you want to drop. It has two keys:
 1707        :type column: dict
 1708        :param table_name: The `table_name` parameter is the name of the table from which you want to
 1709        drop a column
 1710        :type table_name: str
 1711        :param column_name: The `column_name` parameter is the name of the column that you want to drop
 1712        from the table
 1713        :type column_name: str
 1714        :return: a boolean value. It returns True if the column was successfully dropped from the table,
 1715        and False if the column does not exist in the table.
 1716        """
 1717
 1718        # Find column infos
 1719        if column:
 1720            if isinstance(column, dict):
 1721                table_name = column.get("table_name", None)
 1722                column_name = column.get("column_name", None)
 1723            elif isinstance(column, str):
 1724                table_name = self.get_table_variants()
 1725                column_name = column
 1726            else:
 1727                table_name = None
 1728                column_name = None
 1729
 1730        if not table_name and not column_name:
 1731            return False
 1732
 1733        # Removed
 1734        removed = False
 1735
 1736        # Check if the column already exists in the table
 1737        query = f""" SELECT * FROM {table_name} LIMIT 0 """
 1738        columns = self.get_query_to_df(query).columns.tolist()
 1739        if column_name in columns:
 1740            log.debug(f"The {column_name} column exists in the {table_name} table")
 1741        else:
 1742            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
 1743            return False
 1744
 1745        # Add column in table # ALTER TABLE integers DROP k
 1746        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
 1747        self.execute_query(add_column_query)
 1748        removed = True
 1749        log.debug(
 1750            f"The {column_name} column was successfully dropped to the {table_name} table"
 1751        )
 1752
 1753        return removed
 1754
 1755    def explode_infos(
 1756        self,
 1757        prefix: str = None,
 1758        create_index: bool = False,
 1759        fields: list = None,
 1760        force: bool = False,
 1761        proccess_all_fields_together: bool = False,
 1762        table: str = None,
 1763    ) -> list:
 1764        """
 1765        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
 1766        individual columns, returning a list of added columns.
 1767
 1768        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
 1769        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
 1770        `self.get_explode_infos_prefix()` as the prefix
 1771        :type prefix: str
 1772        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
 1773        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
 1774        `False`, indexes will not be created. The default value is `False`, defaults to False
 1775        :type create_index: bool (optional)
 1776        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
 1777        that you want to explode into individual columns. If this parameter is not provided, all INFO
 1778        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
 1779        a list to the `
 1780        :type fields: list
 1781        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
 1782        determines whether to drop and recreate a column if it already exists in the table. If `force`
 1783        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
 1784        defaults to False
 1785        :type force: bool (optional)
 1786        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
 1787        flag that determines whether to process all the INFO fields together or individually. If set to
 1788        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
 1789        be processed individually. The default value is, defaults to False
 1790        :type proccess_all_fields_together: bool (optional)
 1791        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
 1792        of the table where the exploded INFO fields will be added as individual columns. If you provide
 1793        a value for the `table` parameter, the function will use that table name. If the `table`
 1794        parameter is
 1795        :type table: str
 1796        :return: The `explode_infos` function returns a list of added columns.
 1797        """
 1798
 1799        # drop indexes
 1800        self.drop_indexes()
 1801
 1802        # connexion format
 1803        connexion_format = self.get_connexion_format()
 1804
 1805        # Access
 1806        access = self.get_config().get("access", None)
 1807
 1808        # Added columns
 1809        added_columns = []
 1810
 1811        if access not in ["RO"]:
 1812
 1813            # prefix
 1814            if prefix in [None, True] or not isinstance(prefix, str):
 1815                if self.get_explode_infos_prefix() not in [None, True]:
 1816                    prefix = self.get_explode_infos_prefix()
 1817                else:
 1818                    prefix = "INFO/"
 1819
 1820            # table variants
 1821            if table is not None:
 1822                table_variants = table
 1823            else:
 1824                table_variants = self.get_table_variants(clause="select")
 1825
 1826            # extra infos
 1827            try:
 1828                extra_infos = self.get_extra_infos()
 1829            except:
 1830                extra_infos = []
 1831
 1832            # Header infos
 1833            header_infos = self.get_header().infos
 1834
 1835            log.debug(
 1836                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
 1837            )
 1838
 1839            sql_info_alter_table_array = []
 1840
 1841            # Info fields to check
 1842            fields_list = list(header_infos)
 1843            if fields:
 1844                fields_list += fields
 1845            fields_list = set(fields_list)
 1846
 1847            # If no fields
 1848            if not fields:
 1849                fields = []
 1850
 1851            # Translate fields if patterns
 1852            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
 1853
 1854            for info in fields:
 1855
 1856                info_id_sql = prefix + info
 1857
 1858                if (
 1859                    info in fields_list
 1860                    or prefix + info in fields_list
 1861                    or info in extra_infos
 1862                ):
 1863
 1864                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
 1865
 1866                    if info in header_infos:
 1867                        info_type = header_infos[info].type
 1868                        info_num = header_infos[info].num
 1869                    else:
 1870                        info_type = "String"
 1871                        info_num = 0
 1872
 1873                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
 1874                    if info_num != 1:
 1875                        type_sql = "VARCHAR"
 1876
 1877                    # Add field
 1878                    added_column = self.add_column(
 1879                        table_name=table_variants,
 1880                        column_name=info_id_sql,
 1881                        column_type=type_sql,
 1882                        default_value="null",
 1883                        drop=force,
 1884                    )
 1885
 1886                    if added_column:
 1887                        added_columns.append(added_column)
 1888
 1889                    if added_column or force:
 1890
 1891                        # add field to index
 1892                        self.index_additionnal_fields.append(info_id_sql)
 1893
 1894                        # Update field array
 1895                        if connexion_format in ["duckdb"]:
 1896                            update_info_field = f"""
 1897                            "{info_id_sql}" =
 1898                                CASE
 1899                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
 1900                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
 1901                                END
 1902                            """
 1903                        elif connexion_format in ["sqlite"]:
 1904                            update_info_field = f"""
 1905                                "{info_id_sql}" =
 1906                                    CASE
 1907                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
 1908                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
 1909                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
 1910                                    END
 1911                            """
 1912
 1913                        sql_info_alter_table_array.append(update_info_field)
 1914
 1915            if sql_info_alter_table_array:
 1916
 1917                # By chromosomes
 1918                try:
 1919                    chromosomes_list = list(
 1920                        self.get_query_to_df(
 1921                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
 1922                        )["#CHROM"]
 1923                    )
 1924                except:
 1925                    chromosomes_list = [None]
 1926
 1927                for chrom in chromosomes_list:
 1928                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
 1929
 1930                    # Where clause
 1931                    where_clause = ""
 1932                    if chrom and len(chromosomes_list) > 1:
 1933                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
 1934
 1935                    # Update table
 1936                    if proccess_all_fields_together:
 1937                        sql_info_alter_table_array_join = ", ".join(
 1938                            sql_info_alter_table_array
 1939                        )
 1940                        if sql_info_alter_table_array_join:
 1941                            sql_info_alter_table = f"""
 1942                                UPDATE {table_variants}
 1943                                SET {sql_info_alter_table_array_join}
 1944                                {where_clause}
 1945                                """
 1946                            log.debug(
 1947                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
 1948                            )
 1949                            # log.debug(sql_info_alter_table)
 1950                            self.conn.execute(sql_info_alter_table)
 1951                    else:
 1952                        sql_info_alter_num = 0
 1953                        for sql_info_alter in sql_info_alter_table_array:
 1954                            sql_info_alter_num += 1
 1955                            sql_info_alter_table = f"""
 1956                                UPDATE {table_variants}
 1957                                SET {sql_info_alter}
 1958                                {where_clause}
 1959                                """
 1960                            log.debug(
 1961                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
 1962                            )
 1963                            # log.debug(sql_info_alter_table)
 1964                            self.conn.execute(sql_info_alter_table)
 1965
 1966        # create indexes
 1967        if create_index:
 1968            self.create_indexes()
 1969
 1970        return added_columns
 1971
 1972    def create_indexes(self) -> None:
 1973        """
 1974        Create indexes on the table after insertion
 1975        """
 1976
 1977        # Access
 1978        access = self.get_config().get("access", None)
 1979
 1980        # get table variants
 1981        table_variants = self.get_table_variants("FROM")
 1982
 1983        if self.get_indexing() and access not in ["RO"]:
 1984            # Create index
 1985            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
 1986            self.conn.execute(sql_create_table_index)
 1987            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
 1988            self.conn.execute(sql_create_table_index)
 1989            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
 1990            self.conn.execute(sql_create_table_index)
 1991            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
 1992            self.conn.execute(sql_create_table_index)
 1993            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
 1994            self.conn.execute(sql_create_table_index)
 1995            for field in self.index_additionnal_fields:
 1996                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
 1997                self.conn.execute(sql_create_table_index)
 1998
 1999    def drop_indexes(self) -> None:
 2000        """
 2001        Create indexes on the table after insertion
 2002        """
 2003
 2004        # Access
 2005        access = self.get_config().get("access", None)
 2006
 2007        # get table variants
 2008        table_variants = self.get_table_variants("FROM")
 2009
 2010        # Get database format
 2011        connexion_format = self.get_connexion_format()
 2012
 2013        if access not in ["RO"]:
 2014            if connexion_format in ["duckdb"]:
 2015                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
 2016            elif connexion_format in ["sqlite"]:
 2017                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
 2018
 2019            list_indexes = self.conn.execute(sql_list_indexes)
 2020            index_names = [row[0] for row in list_indexes.fetchall()]
 2021            for index in index_names:
 2022                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
 2023                self.conn.execute(sql_drop_table_index)
 2024
 2025    def read_vcf_header(self, f) -> list:
 2026        """
 2027        It reads the header of a VCF file and returns a list of the header lines
 2028
 2029        :param f: the file object
 2030        :return: The header lines of the VCF file.
 2031        """
 2032
 2033        header_list = []
 2034        for line in f:
 2035            header_list.append(line)
 2036            if line.startswith("#CHROM"):
 2037                break
 2038        return header_list
 2039
 2040    def read_vcf_header_file(self, file: str = None) -> list:
 2041        """
 2042        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
 2043        uncompressed files.
 2044
 2045        :param file: The `file` parameter is a string that represents the path to the VCF header file
 2046        that you want to read. It is an optional parameter, so if you don't provide a value, it will
 2047        default to `None`
 2048        :type file: str
 2049        :return: The function `read_vcf_header_file` returns a list.
 2050        """
 2051
 2052        if self.get_input_compressed(input_file=file):
 2053            with bgzf.open(file, "rt") as f:
 2054                return self.read_vcf_header(f=f)
 2055        else:
 2056            with open(file, "rt") as f:
 2057                return self.read_vcf_header(f=f)
 2058
 2059    def execute_query(self, query: str):
 2060        """
 2061        It takes a query as an argument, executes it, and returns the results
 2062
 2063        :param query: The query to be executed
 2064        :return: The result of the query is being returned.
 2065        """
 2066        if query:
 2067            return self.conn.execute(query)  # .fetchall()
 2068        else:
 2069            return None
 2070
 2071    def export_output(
 2072        self,
 2073        output_file: str | None = None,
 2074        output_header: str | None = None,
 2075        export_header: bool = True,
 2076        query: str | None = None,
 2077        parquet_partitions: list | None = None,
 2078        chunk_size: int | None = None,
 2079        threads: int | None = None,
 2080        sort: bool = False,
 2081        index: bool = False,
 2082        order_by: str | None = None,
 2083    ) -> bool:
 2084        """
 2085        The `export_output` function exports data from a VCF file to a specified output file in various
 2086        formats, including VCF, CSV, TSV, PSV, and Parquet.
 2087
 2088        :param output_file: The `output_file` parameter is a string that specifies the name of the
 2089        output file to be generated by the function. This is where the exported data will be saved
 2090        :type output_file: str
 2091        :param output_header: The `output_header` parameter is a string that specifies the name of the
 2092        file where the header of the VCF file will be exported. If this parameter is not provided, the
 2093        header will be exported to a file with the same name as the `output_file` parameter, but with
 2094        the extension "
 2095        :type output_header: str
 2096        :param export_header: The `export_header` parameter is a boolean flag that determines whether
 2097        the header of a VCF file should be exported to a separate file or not. If `export_header` is
 2098        True, the header will be exported to a file. If `export_header` is False, the header will not
 2099        be, defaults to True, if output format is not VCF
 2100        :type export_header: bool (optional)
 2101        :param query: The `query` parameter is an optional SQL query that can be used to filter and
 2102        select specific data from the VCF file before exporting it. If provided, only the data that
 2103        matches the query will be exported
 2104        :type query: str
 2105        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
 2106        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
 2107        organize data in a hierarchical directory structure based on the values of one or more columns.
 2108        This can improve query performance when working with large datasets
 2109        :type parquet_partitions: list
 2110        :param chunk_size: The `chunk_size` parameter specifies the number of
 2111        records in batch when exporting data in Parquet format. This parameter is used for
 2112        partitioning the Parquet file into multiple files.
 2113        :type chunk_size: int
 2114        :param threads: The `threads` parameter is an optional parameter that specifies the number of
 2115        threads to be used during the export process. It determines the level of parallelism and can
 2116        improve the performance of the export operation. If not provided, the function will use the
 2117        default number of threads
 2118        :type threads: int
 2119        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
 2120        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
 2121        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
 2122        False
 2123        :type sort: bool (optional)
 2124        :param index: The `index` parameter is a boolean flag that determines whether an index should be
 2125        created on the output file. If `index` is True, an index will be created. If `index` is False,
 2126        no index will be created. The default value is False, defaults to False
 2127        :type index: bool (optional)
 2128        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
 2129        sorting the output file. This parameter is only applicable when exporting data in VCF format
 2130        :type order_by: str
 2131        :return: a boolean value. It checks if the output file exists and returns True if it does, or
 2132        None if it doesn't.
 2133        """
 2134
 2135        # Log
 2136        log.info("Exporting...")
 2137
 2138        # Full path
 2139        output_file = full_path(output_file)
 2140        output_header = full_path(output_header)
 2141
 2142        # Config
 2143        config = self.get_config()
 2144
 2145        # Param
 2146        param = self.get_param()
 2147
 2148        # Tmp files to remove
 2149        tmp_to_remove = []
 2150
 2151        # If no output, get it
 2152        if not output_file:
 2153            output_file = self.get_output()
 2154
 2155        # If not threads
 2156        if not threads:
 2157            threads = self.get_threads()
 2158
 2159        # Auto header name with extension
 2160        if export_header or output_header:
 2161            if not output_header:
 2162                output_header = f"{output_file}.hdr"
 2163            # Export header
 2164            self.export_header(output_file=output_file)
 2165
 2166        # Switch off export header if VCF output
 2167        output_file_type = get_file_format(output_file)
 2168        if output_file_type in ["vcf"]:
 2169            export_header = False
 2170            tmp_to_remove.append(output_header)
 2171
 2172        # Chunk size
 2173        if not chunk_size:
 2174            chunk_size = config.get("chunk_size", None)
 2175
 2176        # Parquet partition
 2177        if not parquet_partitions:
 2178            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
 2179        if parquet_partitions and isinstance(parquet_partitions, str):
 2180            parquet_partitions = parquet_partitions.split(",")
 2181
 2182        # Order by
 2183        if not order_by:
 2184            order_by = param.get("export", {}).get("order_by", "")
 2185
 2186        # Header in output
 2187        header_in_output = param.get("export", {}).get("include_header", False)
 2188
 2189        # Database
 2190        database_source = self.get_connexion()
 2191
 2192        # Connexion format
 2193        connexion_format = self.get_connexion_format()
 2194
 2195        # Explode infos
 2196        if self.get_explode_infos():
 2197            self.explode_infos(
 2198                prefix=self.get_explode_infos_prefix(),
 2199                fields=self.get_explode_infos_fields(),
 2200                force=False,
 2201            )
 2202
 2203        # if connexion_format in ["sqlite"] or query:
 2204        if connexion_format in ["sqlite"]:
 2205
 2206            # Export in Parquet
 2207            random_tmp = "".join(
 2208                random.choice(string.ascii_lowercase) for i in range(10)
 2209            )
 2210            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
 2211            tmp_to_remove.append(database_source)
 2212
 2213            # Table Variants
 2214            table_variants = self.get_table_variants()
 2215
 2216            # Create export query
 2217            sql_query_export_subquery = f"""
 2218                SELECT * FROM {table_variants}
 2219                """
 2220
 2221            # Write source file
 2222            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
 2223
 2224        # Create database
 2225        database = Database(
 2226            database=database_source,
 2227            table="variants",
 2228            header_file=output_header,
 2229            conn_config=self.get_connexion_config(),
 2230        )
 2231
 2232        # Existing colomns header
 2233        existing_columns_header = database.get_header_columns_from_database()
 2234
 2235        # Sample list
 2236        get_samples = self.get_samples()
 2237        get_samples_check = self.get_samples_check()
 2238        samples_force = get_samples is not None
 2239        sample_list = self.get_header_sample_list(
 2240            check=get_samples_check, samples=get_samples, samples_force=samples_force
 2241        )
 2242
 2243        # Export file
 2244        database.export(
 2245            output_database=output_file,
 2246            output_header=output_header,
 2247            existing_columns_header=existing_columns_header,
 2248            parquet_partitions=parquet_partitions,
 2249            chunk_size=chunk_size,
 2250            threads=threads,
 2251            sort=sort,
 2252            index=index,
 2253            header_in_output=header_in_output,
 2254            order_by=order_by,
 2255            query=query,
 2256            export_header=export_header,
 2257            sample_list=sample_list,
 2258        )
 2259
 2260        # Remove
 2261        remove_if_exists(tmp_to_remove)
 2262
 2263        return (os.path.exists(output_file) or None) and (
 2264            os.path.exists(output_file) or None
 2265        )
 2266
 2267    def get_extra_infos(self, table: str = None) -> list:
 2268        """
 2269        The `get_extra_infos` function returns a list of columns that are in a specified table but not
 2270        in the header.
 2271
 2272        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
 2273        name of the table from which you want to retrieve the extra columns that are not present in the
 2274        header. If the `table` parameter is not provided when calling the function, it will default to
 2275        using the variants
 2276        :type table: str
 2277        :return: A list of columns that are in the specified table but not in the header of the table.
 2278        """
 2279
 2280        header_columns = []
 2281
 2282        if not table:
 2283            table = self.get_table_variants(clause="from")
 2284            header_columns = self.get_header_columns()
 2285
 2286        # Check all columns in the database
 2287        query = f""" SELECT * FROM {table} LIMIT 1 """
 2288        log.debug(f"query {query}")
 2289        table_columns = self.get_query_to_df(query).columns.tolist()
 2290        extra_columns = []
 2291
 2292        # Construct extra infos (not in header)
 2293        for column in table_columns:
 2294            if column not in header_columns:
 2295                extra_columns.append(column)
 2296
 2297        return extra_columns
 2298
 2299    def get_extra_infos_sql(self, table: str = None) -> str:
 2300        """
 2301        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
 2302        by double quotes
 2303
 2304        :param table: The name of the table to get the extra infos from. If None, the default table is
 2305        used
 2306        :type table: str
 2307        :return: A string of the extra infos
 2308        """
 2309
 2310        return ", ".join(
 2311            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
 2312        )
 2313
 2314    def export_header(
 2315        self,
 2316        header_name: str = None,
 2317        output_file: str = None,
 2318        output_file_ext: str = ".hdr",
 2319        clean_header: bool = True,
 2320        remove_chrom_line: bool = False,
 2321    ) -> str:
 2322        """
 2323        The `export_header` function takes a VCF file, extracts the header, modifies it according to
 2324        specified options, and writes it to a new file.
 2325
 2326        :param header_name: The `header_name` parameter is the name of the header file to be created. If
 2327        this parameter is not specified, the header will be written to the output file
 2328        :type header_name: str
 2329        :param output_file: The `output_file` parameter in the `export_header` function is used to
 2330        specify the name of the output file where the header will be written. If this parameter is not
 2331        provided, the header will be written to a temporary file
 2332        :type output_file: str
 2333        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
 2334        string that represents the extension of the output header file. By default, it is set to ".hdr"
 2335        if not specified by the user. This extension will be appended to the `output_file` name to
 2336        create the final, defaults to .hdr
 2337        :type output_file_ext: str (optional)
 2338        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
 2339        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
 2340        `True`, the function will clean the header by modifying certain lines based on a specific
 2341        pattern. If `clean_header`, defaults to True
 2342        :type clean_header: bool (optional)
 2343        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
 2344        boolean flag that determines whether the #CHROM line should be removed from the header before
 2345        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
 2346        defaults to False
 2347        :type remove_chrom_line: bool (optional)
 2348        :return: The function `export_header` returns the name of the temporary header file that is
 2349        created.
 2350        """
 2351
 2352        if not header_name and not output_file:
 2353            output_file = self.get_output()
 2354
 2355        if self.get_header():
 2356
 2357            # Get header object
 2358            header_obj = self.get_header()
 2359
 2360            # Create database
 2361            db_for_header = Database(database=self.get_input())
 2362
 2363            # Get real columns in the file
 2364            db_header_columns = db_for_header.get_columns()
 2365
 2366            with tempfile.TemporaryDirectory() as tmpdir:
 2367
 2368                # Write header file
 2369                header_file_tmp = os.path.join(tmpdir, "header")
 2370                f = open(header_file_tmp, "w")
 2371                vcf.Writer(f, header_obj)
 2372                f.close()
 2373
 2374                # Replace #CHROM line with rel columns
 2375                header_list = db_for_header.read_header_file(
 2376                    header_file=header_file_tmp
 2377                )
 2378                header_list[-1] = "\t".join(db_header_columns)
 2379
 2380                # Remove CHROM line
 2381                if remove_chrom_line:
 2382                    header_list.pop()
 2383
 2384                # Clean header
 2385                if clean_header:
 2386                    header_list_clean = []
 2387                    for head in header_list:
 2388                        # Clean head for malformed header
 2389                        head_clean = head
 2390                        head_clean = re.subn(
 2391                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
 2392                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
 2393                            head_clean,
 2394                            2,
 2395                        )[0]
 2396                        # Write header
 2397                        header_list_clean.append(head_clean)
 2398                    header_list = header_list_clean
 2399
 2400            tmp_header_name = output_file + output_file_ext
 2401
 2402            f = open(tmp_header_name, "w")
 2403            for line in header_list:
 2404                f.write(line)
 2405            f.close()
 2406
 2407        return tmp_header_name
 2408
 2409    def export_variant_vcf(
 2410        self,
 2411        vcf_file,
 2412        remove_info: bool = False,
 2413        add_samples: bool = True,
 2414        list_samples: list = [],
 2415        where_clause: str = "",
 2416        index: bool = False,
 2417        threads: int | None = None,
 2418    ) -> bool | None:
 2419        """
 2420        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
 2421        remove INFO field, add samples, and control compression and indexing.
 2422
 2423        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
 2424        written to. It is the output file that will contain the filtered VCF data based on the specified
 2425        parameters
 2426        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
 2427        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
 2428        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
 2429        in, defaults to False
 2430        :type remove_info: bool (optional)
 2431        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
 2432        the samples should be added to the VCF file or not. If set to True, the samples will be added.
 2433        If set to False, the samples will be removed. The default value is True, defaults to True
 2434        :type add_samples: bool (optional)
 2435        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
 2436        in the output VCF file. By default, all samples will be included. If you provide a list of
 2437        samples, only those samples will be included in the output file
 2438        :type list_samples: list
 2439        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
 2440        determines whether or not to create an index for the output VCF file. If `index` is set to
 2441        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
 2442        :type index: bool (optional)
 2443        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
 2444        number of threads to use for exporting the VCF file. It determines how many parallel threads
 2445        will be used during the export process. More threads can potentially speed up the export process
 2446        by utilizing multiple cores of the processor. If
 2447        :type threads: int | None
 2448        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
 2449        method with various parameters including the output file, query, threads, sort flag, and index
 2450        flag. The `export_output` method is responsible for exporting the VCF data based on the
 2451        specified parameters and configurations provided in the `export_variant_vcf` function.
 2452        """
 2453
 2454        # Config
 2455        config = self.get_config()
 2456
 2457        # Extract VCF
 2458        log.debug("Export VCF...")
 2459
 2460        # Table variants
 2461        table_variants = self.get_table_variants()
 2462
 2463        # Threads
 2464        if not threads:
 2465            threads = self.get_threads()
 2466
 2467        # Info fields
 2468        if remove_info:
 2469            if not isinstance(remove_info, str):
 2470                remove_info = "."
 2471            info_field = f"""'{remove_info}' as INFO"""
 2472        else:
 2473            info_field = "INFO"
 2474
 2475        # Samples fields
 2476        if add_samples:
 2477            if not list_samples:
 2478                list_samples = self.get_header_sample_list()
 2479            if list_samples:
 2480                samples_fields = " , FORMAT , " + " , ".join(list_samples)
 2481            else:
 2482                samples_fields = ""
 2483            log.debug(f"samples_fields: {samples_fields}")
 2484        else:
 2485            samples_fields = ""
 2486
 2487        # Where clause
 2488        if where_clause is None:
 2489            where_clause = ""
 2490
 2491        # Variants
 2492        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
 2493        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
 2494        log.debug(f"sql_query_select={sql_query_select}")
 2495
 2496        return self.export_output(
 2497            output_file=vcf_file,
 2498            output_header=None,
 2499            export_header=True,
 2500            query=sql_query_select,
 2501            parquet_partitions=None,
 2502            chunk_size=config.get("chunk_size", None),
 2503            threads=threads,
 2504            sort=True,
 2505            index=index,
 2506            order_by=None,
 2507        )
 2508
 2509    def run_commands(self, commands: list = [], threads: int = 1) -> None:
 2510        """
 2511        It takes a list of commands and runs them in parallel using the number of threads specified
 2512
 2513        :param commands: A list of commands to run
 2514        :param threads: The number of threads to use, defaults to 1 (optional)
 2515        """
 2516
 2517        run_parallel_commands(commands, threads)
 2518
 2519    def get_threads(self, default: int = 1) -> int:
 2520        """
 2521        This function returns the number of threads to use for a job, with a default value of 1 if not
 2522        specified.
 2523
 2524        :param default: The `default` parameter in the `get_threads` method is used to specify the
 2525        default number of threads to use if no specific value is provided. If no value is provided for
 2526        the `threads` parameter in the configuration or input parameters, the `default` value will be
 2527        used, defaults to 1
 2528        :type default: int (optional)
 2529        :return: the number of threads to use for the current job.
 2530        """
 2531
 2532        # Config
 2533        config = self.get_config()
 2534
 2535        # Param
 2536        param = self.get_param()
 2537
 2538        # Input threads
 2539        input_thread = param.get("threads", config.get("threads", None))
 2540
 2541        # Check threads
 2542        if not input_thread:
 2543            threads = default
 2544        elif int(input_thread) <= 0:
 2545            threads = os.cpu_count()
 2546        else:
 2547            threads = int(input_thread)
 2548        return threads
 2549
 2550    def get_memory(self, default: str = None) -> str:
 2551        """
 2552        This function retrieves the memory value from parameters or configuration with a default value
 2553        if not found.
 2554
 2555        :param default: The `get_memory` function takes in a default value as a string parameter. This
 2556        default value is used as a fallback in case the `memory` parameter is not provided in the
 2557        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
 2558        the function
 2559        :type default: str
 2560        :return: The `get_memory` function returns a string value representing the memory parameter. If
 2561        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
 2562        return the default value provided as an argument to the function.
 2563        """
 2564
 2565        # Config
 2566        config = self.get_config()
 2567
 2568        # Param
 2569        param = self.get_param()
 2570
 2571        # Input threads
 2572        input_memory = param.get("memory", config.get("memory", None))
 2573
 2574        # Check threads
 2575        if input_memory:
 2576            memory = input_memory
 2577        else:
 2578            memory = default
 2579
 2580        return memory
 2581
 2582    def update_from_vcf(self, vcf_file: str) -> None:
 2583        """
 2584        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
 2585
 2586        :param vcf_file: the path to the VCF file
 2587        """
 2588
 2589        connexion_format = self.get_connexion_format()
 2590
 2591        if connexion_format in ["duckdb"]:
 2592            self.update_from_vcf_duckdb(vcf_file)
 2593        elif connexion_format in ["sqlite"]:
 2594            self.update_from_vcf_sqlite(vcf_file)
 2595
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Append the INFO column of a VCF file onto the INFO column of the
        variants table (duckdb connexion).

        The VCF body is loaded into a pandas DataFrame named `vcf_df`,
        which duckdb resolves *by variable name* inside the UPDATE query
        (duckdb replacement scan of local DataFrames). Rows are matched on
        #CHROM/POS/REF/ALT; a matched, non-empty VCF INFO is concatenated
        onto the existing INFO, with a ';' separator when both sides are
        non-empty ('' and '.' are treated as empty).

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF body into a DataFrame, skipping the meta-header so
        # the '#CHROM ...' line becomes the column header.
        # NOTE: `vcf_df` looks unused but is referenced by name in the SQL
        # below (duckdb scans local DataFrames) — do not remove or rename.
        # assumes get_header_length() counts only the '##' meta lines, so
        # the '#CHROM' line is read as header — TODO confirm
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Correlated scalar subquery: picks the matching VCF INFO (if any)
        # and prefixes ';' only when both INFO values are non-empty.
        # duckdb's concat() treats a NULL subquery result (no match) as ''.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)
 2651
 2652    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
 2653        """
 2654        It creates a temporary table in the SQLite database, loads the VCF file into the temporary
 2655        table, then updates the INFO column of the variants table with the INFO column of the temporary
 2656        table
 2657
 2658        :param vcf_file: The path to the VCF file you want to update the database with
 2659        """
 2660
 2661        # Create a temporary table for the VCF
 2662        table_vcf = "tmp_vcf"
 2663        sql_create = (
 2664            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
 2665        )
 2666        self.conn.execute(sql_create)
 2667
 2668        # Loading VCF into temporaire table
 2669        vcf_df = pd.read_csv(
 2670            vcf_file, sep="\t", comment="#", header=None, low_memory=False
 2671        )
 2672        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
 2673        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)
 2674
 2675        # Update table 'variants' with VCF data
 2676        # warning: CONCAT as || operator
 2677        sql_query_update = f"""
 2678            UPDATE variants as table_variants
 2679            SET INFO = CASE
 2680                            WHEN INFO NOT IN ('', '.')
 2681                            THEN INFO
 2682                            ELSE ''
 2683                        END ||
 2684                        (
 2685                        SELECT 
 2686                            CASE 
 2687                                WHEN table_variants.INFO NOT IN ('','.') 
 2688                                    AND table_vcf.INFO NOT IN ('','.')  
 2689                                THEN ';' 
 2690                                ELSE '' 
 2691                            END || 
 2692                            CASE 
 2693                                WHEN table_vcf.INFO NOT IN ('','.') 
 2694                                THEN table_vcf.INFO 
 2695                                ELSE '' 
 2696                            END
 2697                        FROM {table_vcf} as table_vcf
 2698                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
 2699                            AND table_vcf.\"POS\" = table_variants.\"POS\"
 2700                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
 2701                            AND table_vcf.\"REF\" = table_variants.\"REF\"
 2702                        )
 2703        """
 2704        self.conn.execute(sql_query_update)
 2705
 2706        # Drop temporary table
 2707        sql_drop = f"DROP TABLE {table_vcf}"
 2708        self.conn.execute(sql_drop)
 2709
 2710    def drop_variants_table(self) -> None:
 2711        """
 2712        > This function drops the variants table
 2713        """
 2714
 2715        table_variants = self.get_table_variants()
 2716        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
 2717        self.conn.execute(sql_table_variants)
 2718
 2719    def set_variant_id(
 2720        self, variant_id_column: str = "variant_id", force: bool = None
 2721    ) -> str:
 2722        """
 2723        It adds a column to the variants table called `variant_id` and populates it with a hash of the
 2724        `#CHROM`, `POS`, `REF`, and `ALT` columns
 2725
 2726        :param variant_id_column: The name of the column to be created in the variants table, defaults
 2727        to variant_id
 2728        :type variant_id_column: str (optional)
 2729        :param force: If True, the variant_id column will be created even if it already exists
 2730        :type force: bool
 2731        :return: The name of the column that contains the variant_id
 2732        """
 2733
 2734        # Assembly
 2735        assembly = self.get_param().get(
 2736            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 2737        )
 2738
 2739        # INFO/Tag prefix
 2740        prefix = self.get_explode_infos_prefix()
 2741
 2742        # Explode INFO/SVTYPE
 2743        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])
 2744
 2745        # variants table
 2746        table_variants = self.get_table_variants()
 2747
 2748        # variant_id column
 2749        if not variant_id_column:
 2750            variant_id_column = "variant_id"
 2751
 2752        # Creta variant_id column
 2753        if "variant_id" not in self.get_extra_infos() or force:
 2754
 2755            # Create column
 2756            self.add_column(
 2757                table_name=table_variants,
 2758                column_name=variant_id_column,
 2759                column_type="UBIGINT",
 2760                default_value="0",
 2761            )
 2762
 2763            # Update column
 2764            self.conn.execute(
 2765                f"""
 2766                    UPDATE {table_variants}
 2767                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
 2768                """
 2769            )
 2770
 2771        # Remove added columns
 2772        for added_column in added_columns:
 2773            self.drop_column(column=added_column)
 2774
 2775        # return variant_id column name
 2776        return variant_id_column
 2777
 2778    def get_variant_id_column(
 2779        self, variant_id_column: str = "variant_id", force: bool = None
 2780    ) -> str:
 2781        """
 2782        This function returns the variant_id column name
 2783
 2784        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
 2785        defaults to variant_id
 2786        :type variant_id_column: str (optional)
 2787        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
 2788        False, will only set the variant_id if it is not already set. If None, will set the variant_id
 2789        if it is not already set, or if it is set
 2790        :type force: bool
 2791        :return: The variant_id column name.
 2792        """
 2793
 2794        return self.set_variant_id(variant_id_column=variant_id_column, force=force)
 2795
 2796    ###
 2797    # Annotation
 2798    ###
 2799
 2800    def scan_databases(
 2801        self,
 2802        database_formats: list = ["parquet"],
 2803        database_releases: list = ["current"],
 2804    ) -> dict:
 2805        """
 2806        The function `scan_databases` scans for available databases based on specified formats and
 2807        releases.
 2808
 2809        :param database_formats: The `database_formats` parameter is a list that specifies the formats
 2810        of the databases to be scanned. In this case, the accepted format is "parquet"
 2811        :type database_formats: list ["parquet"]
 2812        :param database_releases: The `database_releases` parameter is a list that specifies the
 2813        releases of the databases to be scanned. In the provided function, the default value for
 2814        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
 2815        databases that are in the "current"
 2816        :type database_releases: list
 2817        :return: The function `scan_databases` returns a dictionary containing information about
 2818        databases that match the specified formats and releases.
 2819        """
 2820
 2821        # Config
 2822        config = self.get_config()
 2823
 2824        # Param
 2825        param = self.get_param()
 2826
 2827        # Param - Assembly
 2828        assembly = param.get("assembly", config.get("assembly", None))
 2829        if not assembly:
 2830            assembly = DEFAULT_ASSEMBLY
 2831            log.warning(f"Default assembly '{assembly}'")
 2832
 2833        # Scan for availabled databases
 2834        log.info(
 2835            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
 2836        )
 2837        databases_infos_dict = databases_infos(
 2838            database_folder_releases=database_releases,
 2839            database_formats=database_formats,
 2840            assembly=assembly,
 2841            config=config,
 2842        )
 2843        log.info(
 2844            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
 2845        )
 2846
 2847        return databases_infos_dict
 2848
    def annotation(self) -> None:
        """
        Annotate the VCF with the annotations specified in param/config.

        Two phases:
        1. Normalization: the quick "annotations" parameter (comma-separated
           string) and the per-tool shortcut parameters (annotation_parquet,
           annotation_snpsift, annotation_snpeff, annotation_bcftools,
           annotation_annovar, annotation_exomiser, annotation_splice) are
           merged and expanded into the structured param["annotation"] dict.
           "ALL[:format=...][:release=...]" entries are expanded by scanning
           the configured database folders; plain database entries are
           resolved to files on disk and dispatched to a tool (bcftools or
           parquet) based on format/compression/index.
        2. Execution: each configured tool section triggers the matching
           annotation_*() method, then INFO fields are optionally exploded
           into table columns.
        """

        # Config
        config = self.get_config()

        # Param
        param = self.get_param()

        # Param - Assembly (param takes precedence over config, then default)
        assembly = param.get("assembly", config.get("assembly", None))
        if not assembly:
            assembly = DEFAULT_ASSEMBLY
            log.warning(f"Default assembly '{assembly}'")

        # annotations databases folders: union of the configured
        # 'annotations', 'parquet' and 'bcftools' folders (with defaults)
        annotations_databases = set(
            config.get("folders", {})
            .get("databases", {})
            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
            + config.get("folders", {})
            .get("databases", {})
            .get("parquet", ["~/howard/databases/parquet/current"])
            + config.get("folders", {})
            .get("databases", {})
            .get("bcftools", ["~/howard/databases/bcftools/current"])
        )

        # Get param annotations (quick comma-separated string form)
        if param.get("annotations", None) and isinstance(
            param.get("annotations", None), str
        ):
            log.debug(param.get("annotations", None))
            param_annotation_list = param.get("annotations").split(",")
        else:
            param_annotation_list = []

        # Each tools param: fold the per-tool shortcut parameters into the
        # quick-annotation list, with a "tool:" prefix where relevant
        if param.get("annotation_parquet", None) != None:
            log.debug(
                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
            )
            if isinstance(param.get("annotation_parquet", None), list):
                param_annotation_list.append(",".join(param.get("annotation_parquet")))
            else:
                param_annotation_list.append(param.get("annotation_parquet"))
        if param.get("annotation_snpsift", None) != None:
            # '+' joins multiple databases for a single snpsift entry
            if isinstance(param.get("annotation_snpsift", None), list):
                param_annotation_list.append(
                    "snpsift:"
                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
                )
        if param.get("annotation_snpeff", None) != None:
            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
        if param.get("annotation_bcftools", None) != None:
            if isinstance(param.get("annotation_bcftools", None), list):
                param_annotation_list.append(
                    "bcftools:"
                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
                )
            else:
                param_annotation_list.append(
                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
                )
        if param.get("annotation_annovar", None) != None:
            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
        if param.get("annotation_exomiser", None) != None:
            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
        if param.get("annotation_splice", None) != None:
            param_annotation_list.append("splice:" + param.get("annotation_splice"))

        # Merge param annotations list back into param as one string
        param["annotations"] = ",".join(param_annotation_list)

        # debug
        log.debug(f"param_annotations={param['annotations']}")

        if param.get("annotations"):

            # Log
            # log.info("Annotations - Check annotation parameters")

            if not "annotation" in param:
                param["annotation"] = {}

            # List of annotations parameters: map each entry to a default
            # field selection ({"INFO": None} = all INFO fields)
            annotations_list_input = {}
            if isinstance(param.get("annotations", None), str):
                annotation_file_list = [
                    value for value in param.get("annotations", "").split(",")
                ]
                for annotation_file in annotation_file_list:
                    annotations_list_input[annotation_file] = {"INFO": None}
            else:
                annotations_list_input = param.get("annotations", {})

            log.info(f"Quick Annotations:")
            for annotation_key in list(annotations_list_input.keys()):
                log.info(f"   {annotation_key}")

            # List of annotations and associated fields
            annotations_list = {}

            for annotation_file in annotations_list_input:

                # Explode annotations if ALL: replace the entry with every
                # database found by scan_databases()
                if (
                    annotation_file.upper() == "ALL"
                    or annotation_file.upper().startswith("ALL:")
                ):

                    # check ALL parameters (formats, releases), e.g.
                    # "ALL:format=parquet+vcf:release=current"
                    annotation_file_split = annotation_file.split(":")
                    database_formats = "parquet"
                    database_releases = "current"
                    for annotation_file_option in annotation_file_split[1:]:
                        database_all_options_split = annotation_file_option.split("=")
                        if database_all_options_split[0] == "format":
                            database_formats = database_all_options_split[1].split("+")
                        if database_all_options_split[0] == "release":
                            database_releases = database_all_options_split[1].split("+")

                    # Scan for availabled databases
                    databases_infos_dict = self.scan_databases(
                        database_formats=database_formats,
                        database_releases=database_releases,
                    )

                    # Add found databases in annotation parameters
                    for database_infos in databases_infos_dict.keys():
                        annotations_list[database_infos] = {"INFO": None}

                else:
                    annotations_list[annotation_file] = annotations_list_input[
                        annotation_file
                    ]

            # Check each databases
            if len(annotations_list):

                log.info(
                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
                )

                for annotation_file in annotations_list:

                    # Init
                    annotations = annotations_list.get(annotation_file, None)

                    # Annotation snpEff: everything after 'snpeff:' is
                    # treated as snpEff command-line options
                    if annotation_file.startswith("snpeff"):

                        log.debug(f"Quick Annotation snpEff")

                        if "snpeff" not in param["annotation"]:
                            param["annotation"]["snpeff"] = {}

                        if "options" not in param["annotation"]["snpeff"]:
                            param["annotation"]["snpeff"]["options"] = ""

                        # snpEff options in annotations
                        param["annotation"]["snpeff"]["options"] = "".join(
                            annotation_file.split(":")[1:]
                        )

                    # Annotation Annovar: each ':'-separated token after
                    # 'annovar:' is an annovar annotation name
                    elif annotation_file.startswith("annovar"):

                        log.debug(f"Quick Annotation Annovar")

                        if "annovar" not in param["annotation"]:
                            param["annotation"]["annovar"] = {}

                        if "annotations" not in param["annotation"]["annovar"]:
                            param["annotation"]["annovar"]["annotations"] = {}

                        # Options
                        annotation_file_split = annotation_file.split(":")
                        for annotation_file_annotation in annotation_file_split[1:]:
                            if annotation_file_annotation:
                                param["annotation"]["annovar"]["annotations"][
                                    annotation_file_annotation
                                ] = annotations

                    # Annotation Exomiser: whole entry parsed as a dict
                    elif annotation_file.startswith("exomiser"):

                        log.debug(f"Quick Annotation Exomiser")

                        param["annotation"]["exomiser"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Splice: whole entry parsed as a dict
                    elif annotation_file.startswith("splice"):

                        log.debug(f"Quick Annotation Splice")

                        param["annotation"]["splice"] = params_string_to_dict(
                            annotation_file
                        )

                    # Annotation Parquet or BCFTOOLS: resolve database files
                    # on disk and pick the tool
                    else:

                        # Tools detection: an explicit 'bcftools:'/'snpsift:'
                        # prefix pins the tool, otherwise it is auto-detected
                        if annotation_file.startswith("bcftools:"):
                            annotation_tool_initial = "bcftools"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        elif annotation_file.startswith("snpsift:"):
                            annotation_tool_initial = "snpsift"
                            annotation_file = ":".join(annotation_file.split(":")[1:])
                        else:
                            annotation_tool_initial = None

                        # list of files ('+' and ':' both separate files)
                        # NOTE(review): the loop below rebinds the outer
                        # `annotation_file` variable — intentional here since
                        # only the split list is used afterwards
                        annotation_file_list = annotation_file.replace("+", ":").split(
                            ":"
                        )

                        for annotation_file in annotation_file_list:

                            if annotation_file:

                                # Annotation tool initial
                                annotation_tool = annotation_tool_initial

                                # Find file
                                annotation_file_found = None

                                # Expand user
                                annotation_file = full_path(annotation_file)

                                if os.path.exists(annotation_file):
                                    annotation_file_found = annotation_file

                                else:
                                    # Find within assembly folders
                                    for annotations_database in annotations_databases:
                                        found_files = find_all(
                                            annotation_file,
                                            os.path.join(
                                                annotations_database, assembly
                                            ),
                                        )
                                        if len(found_files) > 0:
                                            annotation_file_found = found_files[0]
                                            break
                                    if not annotation_file_found and not assembly:
                                        # Find within folders
                                        for (
                                            annotations_database
                                        ) in annotations_databases:
                                            found_files = find_all(
                                                annotation_file, annotations_database
                                            )
                                            if len(found_files) > 0:
                                                annotation_file_found = found_files[0]
                                                break
                                log.debug(
                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
                                )

                                # Full path
                                annotation_file_found = full_path(annotation_file_found)

                                if annotation_file_found:

                                    database = Database(database=annotation_file_found)
                                    quick_annotation_format = database.get_format()
                                    quick_annotation_is_compressed = (
                                        database.is_compressed()
                                    )
                                    # presence of a '.tbi' file marks the
                                    # database as tabix-indexed
                                    quick_annotation_is_indexed = os.path.exists(
                                        f"{annotation_file_found}.tbi"
                                    )
                                    # hardcoded off: bcftools is never
                                    # auto-preferred over parquet
                                    bcftools_preference = False

                                    # Check Annotation Tool (auto-detection)
                                    # NOTE(review): "tsv" appears twice in
                                    # the supported-format list below
                                    if not annotation_tool:
                                        if (
                                            bcftools_preference
                                            and quick_annotation_format
                                            in ["vcf", "bed"]
                                            and quick_annotation_is_compressed
                                            and quick_annotation_is_indexed
                                        ):
                                            annotation_tool = "bcftools"
                                        elif quick_annotation_format in [
                                            "vcf",
                                            "bed",
                                            "tsv",
                                            "tsv",
                                            "csv",
                                            "json",
                                            "tbl",
                                            "parquet",
                                            "duckdb",
                                        ]:
                                            annotation_tool = "parquet"
                                        else:
                                            log.error(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )
                                            raise ValueError(
                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
                                            )

                                    log.debug(
                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
                                    )

                                    # Annotation Tool dispatch: register the
                                    # resolved file under its tool section
                                    if annotation_tool:
                                        if annotation_tool not in param["annotation"]:
                                            param["annotation"][annotation_tool] = {}
                                        if (
                                            "annotations"
                                            not in param["annotation"][annotation_tool]
                                        ):
                                            param["annotation"][annotation_tool][
                                                "annotations"
                                            ] = {}
                                        param["annotation"][annotation_tool][
                                            "annotations"
                                        ][annotation_file_found] = annotations

                                else:
                                    # unresolved database: logged, not fatal
                                    log.error(
                                        f"Quick Annotation File {annotation_file} does NOT exist"
                                    )

                self.set_param(param)

        # Execution phase: run each configured annotation tool
        if param.get("annotation", None):
            log.info("Annotations")
            if param.get("annotation", {}).get("parquet", None):
                log.info("Annotations 'parquet'...")
                self.annotation_parquet()
            if param.get("annotation", {}).get("bcftools", None):
                log.info("Annotations 'bcftools'...")
                self.annotation_bcftools()
            if param.get("annotation", {}).get("snpsift", None):
                log.info("Annotations 'snpsift'...")
                self.annotation_snpsift()
            if param.get("annotation", {}).get("annovar", None):
                log.info("Annotations 'annovar'...")
                self.annotation_annovar()
            if param.get("annotation", {}).get("snpeff", None):
                log.info("Annotations 'snpeff'...")
                self.annotation_snpeff()
            if param.get("annotation", {}).get("exomiser", None) is not None:
                log.info("Annotations 'exomiser'...")
                self.annotation_exomiser()
            if param.get("annotation", {}).get("splice", None) is not None:
                log.info("Annotations 'splice' ...")
                self.annotation_splice()

        # Explode INFOS fields into table fields
        if self.get_explode_infos():
            self.explode_infos(
                prefix=self.get_explode_infos_prefix(),
                fields=self.get_explode_infos_fields(),
                force=True,
            )
 3220
 3221    def annotation_snpsift(self, threads: int = None) -> None:
 3222        """
 3223        This function annotate with bcftools
 3224
 3225        :param threads: Number of threads to use
 3226        :return: the value of the variable "return_value".
 3227        """
 3228
 3229        # DEBUG
 3230        log.debug("Start annotation with bcftools databases")
 3231
 3232        # Threads
 3233        if not threads:
 3234            threads = self.get_threads()
 3235        log.debug("Threads: " + str(threads))
 3236
 3237        # Config
 3238        config = self.get_config()
 3239        log.debug("Config: " + str(config))
 3240
 3241        # Config - snpSift
 3242        snpsift_bin_command = get_bin_command(
 3243            bin="SnpSift.jar",
 3244            tool="snpsift",
 3245            bin_type="jar",
 3246            config=config,
 3247            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 3248        )
 3249        if not snpsift_bin_command:
 3250            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
 3251            log.error(msg_err)
 3252            raise ValueError(msg_err)
 3253
 3254        # Config - bcftools
 3255        bcftools_bin_command = get_bin_command(
 3256            bin="bcftools",
 3257            tool="bcftools",
 3258            bin_type="bin",
 3259            config=config,
 3260            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3261        )
 3262        if not bcftools_bin_command:
 3263            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3264            log.error(msg_err)
 3265            raise ValueError(msg_err)
 3266
 3267        # Config - BCFTools databases folders
 3268        databases_folders = set(
 3269            self.get_config()
 3270            .get("folders", {})
 3271            .get("databases", {})
 3272            .get("annotations", ["."])
 3273            + self.get_config()
 3274            .get("folders", {})
 3275            .get("databases", {})
 3276            .get("bcftools", ["."])
 3277        )
 3278        log.debug("Databases annotations: " + str(databases_folders))
 3279
 3280        # Param
 3281        annotations = (
 3282            self.get_param()
 3283            .get("annotation", {})
 3284            .get("snpsift", {})
 3285            .get("annotations", None)
 3286        )
 3287        log.debug("Annotations: " + str(annotations))
 3288
 3289        # Assembly
 3290        assembly = self.get_param().get(
 3291            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3292        )
 3293
 3294        # Data
 3295        table_variants = self.get_table_variants()
 3296
 3297        # Check if not empty
 3298        log.debug("Check if not empty")
 3299        sql_query_chromosomes = (
 3300            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3301        )
 3302        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3303        if not sql_query_chromosomes_df["count"][0]:
 3304            log.info(f"VCF empty")
 3305            return
 3306
 3307        # VCF header
 3308        vcf_reader = self.get_header()
 3309        log.debug("Initial header: " + str(vcf_reader.infos))
 3310
 3311        # Existing annotations
 3312        for vcf_annotation in self.get_header().infos:
 3313
 3314            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3315            log.debug(
 3316                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3317            )
 3318
 3319        if annotations:
 3320
 3321            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 3322
 3323                # Export VCF file
 3324                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
 3325
 3326                # Init
 3327                commands = {}
 3328
 3329                for annotation in annotations:
 3330                    annotation_fields = annotations[annotation]
 3331
 3332                    # Annotation Name
 3333                    annotation_name = os.path.basename(annotation)
 3334
 3335                    if not annotation_fields:
 3336                        annotation_fields = {"INFO": None}
 3337
 3338                    log.debug(f"Annotation '{annotation_name}'")
 3339                    log.debug(
 3340                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3341                    )
 3342
 3343                    # Create Database
 3344                    database = Database(
 3345                        database=annotation,
 3346                        databases_folders=databases_folders,
 3347                        assembly=assembly,
 3348                    )
 3349
 3350                    # Find files
 3351                    db_file = database.get_database()
 3352                    db_file = full_path(db_file)
 3353                    db_hdr_file = database.get_header_file()
 3354                    db_hdr_file = full_path(db_hdr_file)
 3355                    db_file_type = database.get_format()
 3356                    db_tbi_file = f"{db_file}.tbi"
 3357                    db_file_compressed = database.is_compressed()
 3358
 3359                    # Check if compressed
 3360                    if not db_file_compressed:
 3361                        log.error(
 3362                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3363                        )
 3364                        raise ValueError(
 3365                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3366                        )
 3367
 3368                    # Check if indexed
 3369                    if not os.path.exists(db_tbi_file):
 3370                        log.error(
 3371                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3372                        )
 3373                        raise ValueError(
 3374                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3375                        )
 3376
 3377                    # Check index - try to create if not exists
 3378                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3379                        log.error("Annotation failed: database not valid")
 3380                        log.error(f"Annotation annotation file: {db_file}")
 3381                        log.error(f"Annotation annotation header: {db_hdr_file}")
 3382                        log.error(f"Annotation annotation index: {db_tbi_file}")
 3383                        raise ValueError(
 3384                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3385                        )
 3386                    else:
 3387
 3388                        log.debug(
 3389                            f"Annotation '{annotation}' - file: "
 3390                            + str(db_file)
 3391                            + " and "
 3392                            + str(db_hdr_file)
 3393                        )
 3394
 3395                        # Load header as VCF object
 3396                        db_hdr_vcf = Variants(input=db_hdr_file)
 3397                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3398                        log.debug(
 3399                            "Annotation database header: "
 3400                            + str(db_hdr_vcf_header_infos)
 3401                        )
 3402
 3403                        # For all fields in database
 3404                        annotation_fields_full = False
 3405                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3406                            annotation_fields = {
 3407                                key: key for key in db_hdr_vcf_header_infos
 3408                            }
 3409                            log.debug(
 3410                                "Annotation database header - All annotations added: "
 3411                                + str(annotation_fields)
 3412                            )
 3413                            annotation_fields_full = True
 3414
 3415                        # # Create file for field rename
 3416                        # log.debug("Create file for field rename")
 3417                        # tmp_rename = NamedTemporaryFile(
 3418                        #     prefix=self.get_prefix(),
 3419                        #     dir=self.get_tmp_dir(),
 3420                        #     suffix=".rename",
 3421                        #     delete=False,
 3422                        # )
 3423                        # tmp_rename_name = tmp_rename.name
 3424                        # tmp_files.append(tmp_rename_name)
 3425
 3426                        # Number of fields
 3427                        nb_annotation_field = 0
 3428                        annotation_list = []
 3429                        annotation_infos_rename_list = []
 3430
 3431                        for annotation_field in annotation_fields:
 3432
 3433                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3434                            annotation_fields_new_name = annotation_fields.get(
 3435                                annotation_field, annotation_field
 3436                            )
 3437                            if not annotation_fields_new_name:
 3438                                annotation_fields_new_name = annotation_field
 3439
 3440                            # Check if field is in DB and if field is not elready in input data
 3441                            if (
 3442                                annotation_field in db_hdr_vcf.get_header().infos
 3443                                and annotation_fields_new_name
 3444                                not in self.get_header().infos
 3445                            ):
 3446
 3447                                log.info(
 3448                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3449                                )
 3450
 3451                                # BCFTools annotate param to rename fields
 3452                                if annotation_field != annotation_fields_new_name:
 3453                                    annotation_infos_rename_list.append(
 3454                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3455                                    )
 3456
 3457                                # Add INFO field to header
 3458                                db_hdr_vcf_header_infos_number = (
 3459                                    db_hdr_vcf_header_infos[annotation_field].num or "."
 3460                                )
 3461                                db_hdr_vcf_header_infos_type = (
 3462                                    db_hdr_vcf_header_infos[annotation_field].type
 3463                                    or "String"
 3464                                )
 3465                                db_hdr_vcf_header_infos_description = (
 3466                                    db_hdr_vcf_header_infos[annotation_field].desc
 3467                                    or f"{annotation_field} description"
 3468                                )
 3469                                db_hdr_vcf_header_infos_source = (
 3470                                    db_hdr_vcf_header_infos[annotation_field].source
 3471                                    or "unknown"
 3472                                )
 3473                                db_hdr_vcf_header_infos_version = (
 3474                                    db_hdr_vcf_header_infos[annotation_field].version
 3475                                    or "unknown"
 3476                                )
 3477
 3478                                vcf_reader.infos[annotation_fields_new_name] = (
 3479                                    vcf.parser._Info(
 3480                                        annotation_fields_new_name,
 3481                                        db_hdr_vcf_header_infos_number,
 3482                                        db_hdr_vcf_header_infos_type,
 3483                                        db_hdr_vcf_header_infos_description,
 3484                                        db_hdr_vcf_header_infos_source,
 3485                                        db_hdr_vcf_header_infos_version,
 3486                                        self.code_type_map[
 3487                                            db_hdr_vcf_header_infos_type
 3488                                        ],
 3489                                    )
 3490                                )
 3491
 3492                                annotation_list.append(annotation_field)
 3493
 3494                                nb_annotation_field += 1
 3495
 3496                            else:
 3497
 3498                                if (
 3499                                    annotation_field
 3500                                    not in db_hdr_vcf.get_header().infos
 3501                                ):
 3502                                    log.warning(
 3503                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
 3504                                    )
 3505                                if (
 3506                                    annotation_fields_new_name
 3507                                    in self.get_header().infos
 3508                                ):
 3509                                    log.warning(
 3510                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3511                                    )
 3512
 3513                        log.info(
 3514                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3515                        )
 3516
 3517                        annotation_infos = ",".join(annotation_list)
 3518
 3519                        if annotation_infos != "":
 3520
 3521                            # Annotated VCF (and error file)
 3522                            tmp_annotation_vcf_name = os.path.join(
 3523                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
 3524                            )
 3525                            tmp_annotation_vcf_name_err = (
 3526                                tmp_annotation_vcf_name + ".err"
 3527                            )
 3528
 3529                            # Add fields to annotate
 3530                            if not annotation_fields_full:
 3531                                annotation_infos_option = f"-info {annotation_infos}"
 3532                            else:
 3533                                annotation_infos_option = ""
 3534
 3535                            # Info fields rename
 3536                            if annotation_infos_rename_list:
 3537                                annotation_infos_rename = " -c " + ",".join(
 3538                                    annotation_infos_rename_list
 3539                                )
 3540                            else:
 3541                                annotation_infos_rename = ""
 3542
 3543                            # Annotate command
 3544                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3545
 3546                            # Add command
 3547                            commands[command_annotate] = tmp_annotation_vcf_name
 3548
 3549                if commands:
 3550
 3551                    # Export VCF file
 3552                    self.export_variant_vcf(
 3553                        vcf_file=tmp_vcf_name,
 3554                        remove_info=True,
 3555                        add_samples=False,
 3556                        index=True,
 3557                    )
 3558                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
 3559
 3560                    # Num command
 3561                    nb_command = 0
 3562
 3563                    # Annotate
 3564                    for command_annotate in commands:
 3565                        nb_command += 1
 3566                        log.info(
 3567                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
 3568                        )
 3569                        log.debug(f"command_annotate={command_annotate}")
 3570                        run_parallel_commands([command_annotate], threads)
 3571
 3572                        # Debug
 3573                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
 3574
 3575                        # Update variants
 3576                        log.info(
 3577                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
 3578                        )
 3579                        self.update_from_vcf(commands[command_annotate])
 3580
 3581    def annotation_bcftools(self, threads: int = None) -> None:
 3582        """
 3583        This function annotate with bcftools
 3584
 3585        :param threads: Number of threads to use
 3586        :return: the value of the variable "return_value".
 3587        """
 3588
 3589        # DEBUG
 3590        log.debug("Start annotation with bcftools databases")
 3591
 3592        # Threads
 3593        if not threads:
 3594            threads = self.get_threads()
 3595        log.debug("Threads: " + str(threads))
 3596
 3597        # Config
 3598        config = self.get_config()
 3599        log.debug("Config: " + str(config))
 3600
 3601        # DEBUG
 3602        delete_tmp = True
 3603        if self.get_config().get("verbosity", "warning") in ["debug"]:
 3604            delete_tmp = False
 3605            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 3606
 3607        # Config - BCFTools bin command
 3608        bcftools_bin_command = get_bin_command(
 3609            bin="bcftools",
 3610            tool="bcftools",
 3611            bin_type="bin",
 3612            config=config,
 3613            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 3614        )
 3615        if not bcftools_bin_command:
 3616            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 3617            log.error(msg_err)
 3618            raise ValueError(msg_err)
 3619
 3620        # Config - BCFTools databases folders
 3621        databases_folders = set(
 3622            self.get_config()
 3623            .get("folders", {})
 3624            .get("databases", {})
 3625            .get("annotations", ["."])
 3626            + self.get_config()
 3627            .get("folders", {})
 3628            .get("databases", {})
 3629            .get("bcftools", ["."])
 3630        )
 3631        log.debug("Databases annotations: " + str(databases_folders))
 3632
 3633        # Param
 3634        annotations = (
 3635            self.get_param()
 3636            .get("annotation", {})
 3637            .get("bcftools", {})
 3638            .get("annotations", None)
 3639        )
 3640        log.debug("Annotations: " + str(annotations))
 3641
 3642        # Assembly
 3643        assembly = self.get_param().get(
 3644            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 3645        )
 3646
 3647        # Data
 3648        table_variants = self.get_table_variants()
 3649
 3650        # Check if not empty
 3651        log.debug("Check if not empty")
 3652        sql_query_chromosomes = (
 3653            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 3654        )
 3655        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 3656        if not sql_query_chromosomes_df["count"][0]:
 3657            log.info(f"VCF empty")
 3658            return
 3659
 3660        # Export in VCF
 3661        log.debug("Create initial file to annotate")
 3662        tmp_vcf = NamedTemporaryFile(
 3663            prefix=self.get_prefix(),
 3664            dir=self.get_tmp_dir(),
 3665            suffix=".vcf.gz",
 3666            delete=False,
 3667        )
 3668        tmp_vcf_name = tmp_vcf.name
 3669
 3670        # VCF header
 3671        vcf_reader = self.get_header()
 3672        log.debug("Initial header: " + str(vcf_reader.infos))
 3673
 3674        # Existing annotations
 3675        for vcf_annotation in self.get_header().infos:
 3676
 3677            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 3678            log.debug(
 3679                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 3680            )
 3681
 3682        if annotations:
 3683
 3684            tmp_ann_vcf_list = []
 3685            commands = []
 3686            tmp_files = []
 3687            err_files = []
 3688
 3689            for annotation in annotations:
 3690                annotation_fields = annotations[annotation]
 3691
 3692                # Annotation Name
 3693                annotation_name = os.path.basename(annotation)
 3694
 3695                if not annotation_fields:
 3696                    annotation_fields = {"INFO": None}
 3697
 3698                log.debug(f"Annotation '{annotation_name}'")
 3699                log.debug(
 3700                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 3701                )
 3702
 3703                # Create Database
 3704                database = Database(
 3705                    database=annotation,
 3706                    databases_folders=databases_folders,
 3707                    assembly=assembly,
 3708                )
 3709
 3710                # Find files
 3711                db_file = database.get_database()
 3712                db_file = full_path(db_file)
 3713                db_hdr_file = database.get_header_file()
 3714                db_hdr_file = full_path(db_hdr_file)
 3715                db_file_type = database.get_format()
 3716                db_tbi_file = f"{db_file}.tbi"
 3717                db_file_compressed = database.is_compressed()
 3718
 3719                # Check if compressed
 3720                if not db_file_compressed:
 3721                    log.error(
 3722                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3723                    )
 3724                    raise ValueError(
 3725                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
 3726                    )
 3727
 3728                # Check if indexed
 3729                if not os.path.exists(db_tbi_file):
 3730                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
 3731                    raise ValueError(
 3732                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
 3733                    )
 3734
 3735                # Check index - try to create if not exists
 3736                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
 3737                    log.error("Annotation failed: database not valid")
 3738                    log.error(f"Annotation annotation file: {db_file}")
 3739                    log.error(f"Annotation annotation header: {db_hdr_file}")
 3740                    log.error(f"Annotation annotation index: {db_tbi_file}")
 3741                    raise ValueError(
 3742                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
 3743                    )
 3744                else:
 3745
 3746                    log.debug(
 3747                        f"Annotation '{annotation}' - file: "
 3748                        + str(db_file)
 3749                        + " and "
 3750                        + str(db_hdr_file)
 3751                    )
 3752
 3753                    # Load header as VCF object
 3754                    db_hdr_vcf = Variants(input=db_hdr_file)
 3755                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
 3756                    log.debug(
 3757                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
 3758                    )
 3759
 3760                    # For all fields in database
 3761                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 3762                        annotation_fields = {
 3763                            key: key for key in db_hdr_vcf_header_infos
 3764                        }
 3765                        log.debug(
 3766                            "Annotation database header - All annotations added: "
 3767                            + str(annotation_fields)
 3768                        )
 3769
 3770                    # Number of fields
 3771                    nb_annotation_field = 0
 3772                    annotation_list = []
 3773
 3774                    for annotation_field in annotation_fields:
 3775
 3776                        # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 3777                        annotation_fields_new_name = annotation_fields.get(
 3778                            annotation_field, annotation_field
 3779                        )
 3780                        if not annotation_fields_new_name:
 3781                            annotation_fields_new_name = annotation_field
 3782
 3783                        # Check if field is in DB and if field is not elready in input data
 3784                        if (
 3785                            annotation_field in db_hdr_vcf.get_header().infos
 3786                            and annotation_fields_new_name
 3787                            not in self.get_header().infos
 3788                        ):
 3789
 3790                            log.info(
 3791                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
 3792                            )
 3793
 3794                            # Add INFO field to header
 3795                            db_hdr_vcf_header_infos_number = (
 3796                                db_hdr_vcf_header_infos[annotation_field].num or "."
 3797                            )
 3798                            db_hdr_vcf_header_infos_type = (
 3799                                db_hdr_vcf_header_infos[annotation_field].type
 3800                                or "String"
 3801                            )
 3802                            db_hdr_vcf_header_infos_description = (
 3803                                db_hdr_vcf_header_infos[annotation_field].desc
 3804                                or f"{annotation_field} description"
 3805                            )
 3806                            db_hdr_vcf_header_infos_source = (
 3807                                db_hdr_vcf_header_infos[annotation_field].source
 3808                                or "unknown"
 3809                            )
 3810                            db_hdr_vcf_header_infos_version = (
 3811                                db_hdr_vcf_header_infos[annotation_field].version
 3812                                or "unknown"
 3813                            )
 3814
 3815                            vcf_reader.infos[annotation_fields_new_name] = (
 3816                                vcf.parser._Info(
 3817                                    annotation_fields_new_name,
 3818                                    db_hdr_vcf_header_infos_number,
 3819                                    db_hdr_vcf_header_infos_type,
 3820                                    db_hdr_vcf_header_infos_description,
 3821                                    db_hdr_vcf_header_infos_source,
 3822                                    db_hdr_vcf_header_infos_version,
 3823                                    self.code_type_map[db_hdr_vcf_header_infos_type],
 3824                                )
 3825                            )
 3826
 3827                            # annotation_list.append(annotation_field)
 3828                            if annotation_field != annotation_fields_new_name:
 3829                                annotation_list.append(
 3830                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
 3831                                )
 3832                            else:
 3833                                annotation_list.append(annotation_field)
 3834
 3835                            nb_annotation_field += 1
 3836
 3837                        else:
 3838
 3839                            if annotation_field not in db_hdr_vcf.get_header().infos:
 3840                                log.warning(
 3841                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
 3842                                )
 3843                            if annotation_fields_new_name in self.get_header().infos:
 3844                                log.warning(
 3845                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 3846                                )
 3847
 3848                    log.info(
 3849                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
 3850                    )
 3851
 3852                    annotation_infos = ",".join(annotation_list)
 3853
 3854                    if annotation_infos != "":
 3855
 3856                        # Protect header for bcftools (remove "#CHROM" and variants line)
 3857                        log.debug("Protect Header file - remove #CHROM line if exists")
 3858                        tmp_header_vcf = NamedTemporaryFile(
 3859                            prefix=self.get_prefix(),
 3860                            dir=self.get_tmp_dir(),
 3861                            suffix=".hdr",
 3862                            delete=False,
 3863                        )
 3864                        tmp_header_vcf_name = tmp_header_vcf.name
 3865                        tmp_files.append(tmp_header_vcf_name)
 3866                        # Command
 3867                        if db_hdr_file.endswith(".gz"):
 3868                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3869                        else:
 3870                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
 3871                        # Run
 3872                        run_parallel_commands([command_extract_header], 1)
 3873
 3874                        # Find chomosomes
 3875                        log.debug("Find chromosomes ")
 3876                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
 3877                        sql_query_chromosomes_df = self.get_query_to_df(
 3878                            sql_query_chromosomes
 3879                        )
 3880                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])
 3881
 3882                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))
 3883
 3884                        # BED columns in the annotation file
 3885                        if db_file_type in ["bed"]:
 3886                            annotation_infos = "CHROM,POS,POS," + annotation_infos
 3887
 3888                        for chrom in chomosomes_list:
 3889
 3890                            # Create BED on initial VCF
 3891                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
 3892                            tmp_bed = NamedTemporaryFile(
 3893                                prefix=self.get_prefix(),
 3894                                dir=self.get_tmp_dir(),
 3895                                suffix=".bed",
 3896                                delete=False,
 3897                            )
 3898                            tmp_bed_name = tmp_bed.name
 3899                            tmp_files.append(tmp_bed_name)
 3900
 3901                            # Detecte regions
 3902                            log.debug(
 3903                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
 3904                            )
 3905                            window = 1000000
 3906                            sql_query_intervals_for_bed = f"""
 3907                                SELECT  \"#CHROM\",
 3908                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
 3909                                        \"POS\"+{window}
 3910                                FROM {table_variants} as table_variants
 3911                                WHERE table_variants.\"#CHROM\" = '{chrom}'
 3912                            """
 3913                            regions = self.conn.execute(
 3914                                sql_query_intervals_for_bed
 3915                            ).fetchall()
 3916                            merged_regions = merge_regions(regions)
 3917                            log.debug(
 3918                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
 3919                            )
 3920
 3921                            header = ["#CHROM", "START", "END"]
 3922                            with open(tmp_bed_name, "w") as f:
 3923                                # Write the header with tab delimiter
 3924                                f.write("\t".join(header) + "\n")
 3925                                for d in merged_regions:
 3926                                    # Write each data row with tab delimiter
 3927                                    f.write("\t".join(map(str, d)) + "\n")
 3928
 3929                            # Tmp files
 3930                            tmp_annotation_vcf = NamedTemporaryFile(
 3931                                prefix=self.get_prefix(),
 3932                                dir=self.get_tmp_dir(),
 3933                                suffix=".vcf.gz",
 3934                                delete=False,
 3935                            )
 3936                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
 3937                            tmp_files.append(tmp_annotation_vcf_name)
 3938                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
 3939                            tmp_annotation_vcf_name_err = (
 3940                                tmp_annotation_vcf_name + ".err"
 3941                            )
 3942                            err_files.append(tmp_annotation_vcf_name_err)
 3943
 3944                            # Annotate Command
 3945                            log.debug(
 3946                                f"Annotation '{annotation}' - add bcftools command"
 3947                            )
 3948
 3949                            # Command
 3950                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
 3951
 3952                            # Add command
 3953                            commands.append(command_annotate)
 3954
 3955            # if some commands
 3956            if commands:
 3957
 3958                # Export VCF file
 3959                self.export_variant_vcf(
 3960                    vcf_file=tmp_vcf_name,
 3961                    remove_info=True,
 3962                    add_samples=False,
 3963                    index=True,
 3964                )
 3965
 3966                # Threads
 3967                # calculate threads for annotated commands
 3968                if commands:
 3969                    threads_bcftools_annotate = round(threads / len(commands))
 3970                else:
 3971                    threads_bcftools_annotate = 1
 3972
 3973                if not threads_bcftools_annotate:
 3974                    threads_bcftools_annotate = 1
 3975
 3976                # Add threads option to bcftools commands
 3977                if threads_bcftools_annotate > 1:
 3978                    commands_threaded = []
 3979                    for command in commands:
 3980                        commands_threaded.append(
 3981                            command.replace(
 3982                                f"{bcftools_bin_command} annotate ",
 3983                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
 3984                            )
 3985                        )
 3986                    commands = commands_threaded
 3987
 3988                # Command annotation multithreading
 3989                log.debug(f"Annotation - Annotation commands: " + str(commands))
 3990                log.info(
 3991                    f"Annotation - Annotation multithreaded in "
 3992                    + str(len(commands))
 3993                    + " commands"
 3994                )
 3995
 3996                run_parallel_commands(commands, threads)
 3997
 3998                # Merge
 3999                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)
 4000
 4001                if tmp_ann_vcf_list_cmd:
 4002
 4003                    # Tmp file
 4004                    tmp_annotate_vcf = NamedTemporaryFile(
 4005                        prefix=self.get_prefix(),
 4006                        dir=self.get_tmp_dir(),
 4007                        suffix=".vcf.gz",
 4008                        delete=True,
 4009                    )
 4010                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
 4011                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 4012                    err_files.append(tmp_annotate_vcf_name_err)
 4013
 4014                    # Tmp file remove command
 4015                    tmp_files_remove_command = ""
 4016                    if tmp_files:
 4017                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)
 4018
 4019                    # Command merge
 4020                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
 4021                    log.info(
 4022                        f"Annotation - Annotation merging "
 4023                        + str(len(commands))
 4024                        + " annotated files"
 4025                    )
 4026                    log.debug(f"Annotation - merge command: {merge_command}")
 4027                    run_parallel_commands([merge_command], 1)
 4028
 4029                    # Error messages
 4030                    log.info(f"Error/Warning messages:")
 4031                    error_message_command_all = []
 4032                    error_message_command_warning = []
 4033                    error_message_command_err = []
 4034                    for err_file in err_files:
 4035                        with open(err_file, "r") as f:
 4036                            for line in f:
 4037                                message = line.strip()
 4038                                error_message_command_all.append(message)
 4039                                if line.startswith("[W::"):
 4040                                    error_message_command_warning.append(message)
 4041                                if line.startswith("[E::"):
 4042                                    error_message_command_err.append(
 4043                                        f"{err_file}: " + message
 4044                                    )
 4045                    # log info
 4046                    for message in list(
 4047                        set(error_message_command_err + error_message_command_warning)
 4048                    ):
 4049                        log.info(f"   {message}")
 4050                    # debug info
 4051                    for message in list(set(error_message_command_all)):
 4052                        log.debug(f"   {message}")
 4053                    # failed
 4054                    if len(error_message_command_err):
 4055                        log.error("Annotation failed: Error in commands")
 4056                        raise ValueError("Annotation failed: Error in commands")
 4057
 4058                    # Update variants
 4059                    log.info(f"Annotation - Updating...")
 4060                    self.update_from_vcf(tmp_annotate_vcf_name)
 4061
 4062    def annotation_exomiser(self, threads: int = None) -> None:
 4063        """
 4064        This function annotate with Exomiser
 4065
 4066        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
 4067        - "analysis" (dict/file):
 4068            Full analysis dictionnary parameters (see Exomiser docs).
 4069            Either a dict, or a file in JSON or YAML format.
 4070            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
 4071            Default : None
 4072        - "preset" (string):
 4073            Analysis preset (available in config folder).
 4074            Used if no full "analysis" is provided.
 4075            Default: "exome"
 4076        - "phenopacket" (dict/file):
 4077            Samples and phenotipic features parameters (see Exomiser docs).
 4078            Either a dict, or a file in JSON or YAML format.
 4079            Default: None
 4080        - "subject" (dict):
 4081            Sample parameters (see Exomiser docs).
 4082            Example:
 4083                "subject":
 4084                    {
 4085                        "id": "ISDBM322017",
 4086                        "sex": "FEMALE"
 4087                    }
 4088            Default: None
 4089        - "sample" (string):
 4090            Sample name to construct "subject" section:
 4091                "subject":
 4092                    {
 4093                        "id": "<sample>",
 4094                        "sex": "UNKNOWN_SEX"
 4095                    }
 4096            Default: None
 4097        - "phenotypicFeatures" (dict)
 4098            Phenotypic features to construct "subject" section.
 4099            Example:
 4100                "phenotypicFeatures":
 4101                    [
 4102                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
 4103                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
 4104                    ]
 4105        - "hpo" (list)
 4106            List of HPO ids as phenotypic features.
 4107            Example:
 4108                "hpo": ['0001156', '0001363', '0011304', '0010055']
 4109            Default: []
 4110        - "outputOptions" (dict):
 4111            Output options (see Exomiser docs).
 4112            Default:
 4113                "output_options" =
 4114                    {
 4115                        "outputContributingVariantsOnly": False,
 4116                        "numGenes": 0,
 4117                        "outputFormats": ["TSV_VARIANT", "VCF"]
 4118                    }
 4119        - "transcript_source" (string):
 4120            Transcript source (either "refseq", "ucsc", "ensembl")
 4121            Default: "refseq"
 4122        - "exomiser_to_info" (boolean):
 4123            Add exomiser TSV file columns as INFO fields in VCF.
 4124            Default: False
 4125        - "release" (string):
 4126            Exomise database release.
 4127            If not exists, database release will be downloaded (take a while).
 4128            Default: None (provided by application.properties configuration file)
 4129        - "exomiser_application_properties" (file):
 4130            Exomiser configuration file (see Exomiser docs).
 4131            Useful to automatically download databases (especially for specific genome databases).
 4132
 4133        Notes:
 4134        - If no sample in parameters, first sample in VCF will be chosen
 4135        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
 4136
 4137        :param threads: The number of threads to use
 4138        :return: None.
 4139        """
 4140
 4141        # DEBUG
 4142        log.debug("Start annotation with Exomiser databases")
 4143
 4144        # Threads
 4145        if not threads:
 4146            threads = self.get_threads()
 4147        log.debug("Threads: " + str(threads))
 4148
 4149        # Config
 4150        config = self.get_config()
 4151        log.debug("Config: " + str(config))
 4152
 4153        # Config - Folders - Databases
 4154        databases_folders = (
 4155            config.get("folders", {})
 4156            .get("databases", {})
 4157            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
 4158        )
 4159        databases_folders = full_path(databases_folders)
 4160        if not os.path.exists(databases_folders):
 4161            log.error(f"Databases annotations: {databases_folders} NOT found")
 4162        log.debug("Databases annotations: " + str(databases_folders))
 4163
 4164        # Config - Exomiser
 4165        exomiser_bin_command = get_bin_command(
 4166            bin="exomiser-cli*.jar",
 4167            tool="exomiser",
 4168            bin_type="jar",
 4169            config=config,
 4170            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
 4171        )
 4172        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
 4173        if not exomiser_bin_command:
 4174            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
 4175            log.error(msg_err)
 4176            raise ValueError(msg_err)
 4177
 4178        # Param
 4179        param = self.get_param()
 4180        log.debug("Param: " + str(param))
 4181
 4182        # Param - Exomiser
 4183        param_exomiser = param.get("annotation", {}).get("exomiser", {})
 4184        log.debug(f"Param Exomiser: {param_exomiser}")
 4185
 4186        # Param - Assembly
 4187        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4188        log.debug("Assembly: " + str(assembly))
 4189
 4190        # Data
 4191        table_variants = self.get_table_variants()
 4192
 4193        # Check if not empty
 4194        log.debug("Check if not empty")
 4195        sql_query_chromosomes = (
 4196            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4197        )
 4198        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4199            log.info(f"VCF empty")
 4200            return False
 4201
 4202        # VCF header
 4203        vcf_reader = self.get_header()
 4204        log.debug("Initial header: " + str(vcf_reader.infos))
 4205
 4206        # Samples
 4207        samples = self.get_header_sample_list()
 4208        if not samples:
 4209            log.error("No Samples in VCF")
 4210            return False
 4211        log.debug(f"Samples: {samples}")
 4212
 4213        # Memory limit
 4214        memory_limit = self.get_memory("8G")
 4215        log.debug(f"memory_limit: {memory_limit}")
 4216
 4217        # Exomiser java options
 4218        exomiser_java_options = (
 4219            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4220        )
 4221        log.debug(f"Exomiser java options: {exomiser_java_options}")
 4222
 4223        # Download Exomiser (if not exists)
 4224        exomiser_release = param_exomiser.get("release", None)
 4225        exomiser_application_properties = param_exomiser.get(
 4226            "exomiser_application_properties", None
 4227        )
 4228        databases_download_exomiser(
 4229            assemblies=[assembly],
 4230            exomiser_folder=databases_folders,
 4231            exomiser_release=exomiser_release,
 4232            exomiser_phenotype_release=exomiser_release,
 4233            exomiser_application_properties=exomiser_application_properties,
 4234        )
 4235
 4236        # Force annotation
 4237        force_update_annotation = True
 4238
 4239        if "Exomiser" not in self.get_header().infos or force_update_annotation:
 4240            log.debug("Start annotation Exomiser")
 4241
 4242            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
 4243
 4244                # tmp_dir = "/tmp/exomiser"
 4245
 4246                ### ANALYSIS ###
 4247                ################
 4248
 4249                # Create analysis.json through analysis dict
 4250                # either analysis in param or by default
 4251                # depending on preset exome/genome)
 4252
 4253                # Init analysis dict
 4254                param_exomiser_analysis_dict = {}
 4255
 4256                # analysis from param
 4257                param_exomiser_analysis = param_exomiser.get("analysis", {})
 4258                param_exomiser_analysis = full_path(param_exomiser_analysis)
 4259
 4260                # If analysis in param -> load anlaysis json
 4261                if param_exomiser_analysis:
 4262
 4263                    # If param analysis is a file and exists
 4264                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
 4265                        param_exomiser_analysis
 4266                    ):
 4267                        # Load analysis file into analysis dict (either yaml or json)
 4268                        with open(param_exomiser_analysis) as json_file:
 4269                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
 4270
 4271                    # If param analysis is a dict
 4272                    elif isinstance(param_exomiser_analysis, dict):
 4273                        # Load analysis dict into analysis dict (either yaml or json)
 4274                        param_exomiser_analysis_dict = param_exomiser_analysis
 4275
 4276                    # Error analysis type
 4277                    else:
 4278                        log.error(f"Analysis type unknown. Check param file.")
 4279                        raise ValueError(f"Analysis type unknown. Check param file.")
 4280
 4281                # Case no input analysis config file/dict
 4282                # Use preset (exome/genome) to open default config file
 4283                if not param_exomiser_analysis_dict:
 4284
 4285                    # default preset
 4286                    default_preset = "exome"
 4287
 4288                    # Get param preset or default preset
 4289                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
 4290
 4291                    # Try to find if preset is a file
 4292                    if os.path.exists(param_exomiser_preset):
 4293                        # Preset file is provided in full path
 4294                        param_exomiser_analysis_default_config_file = (
 4295                            param_exomiser_preset
 4296                        )
 4297                    # elif os.path.exists(full_path(param_exomiser_preset)):
 4298                    #     # Preset file is provided in full path
 4299                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
 4300                    elif os.path.exists(
 4301                        os.path.join(folder_config, param_exomiser_preset)
 4302                    ):
 4303                        # Preset file is provided a basename in config folder (can be a path with subfolders)
 4304                        param_exomiser_analysis_default_config_file = os.path.join(
 4305                            folder_config, param_exomiser_preset
 4306                        )
 4307                    else:
 4308                        # Construct preset file
 4309                        param_exomiser_analysis_default_config_file = os.path.join(
 4310                            folder_config,
 4311                            f"preset-{param_exomiser_preset}-analysis.json",
 4312                        )
 4313
 4314                    # If preset file exists
 4315                    param_exomiser_analysis_default_config_file = full_path(
 4316                        param_exomiser_analysis_default_config_file
 4317                    )
 4318                    if os.path.exists(param_exomiser_analysis_default_config_file):
 4319                        # Load prest file into analysis dict (either yaml or json)
 4320                        with open(
 4321                            param_exomiser_analysis_default_config_file
 4322                        ) as json_file:
 4323                            # param_exomiser_analysis_dict[""] = json.load(json_file)
 4324                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
 4325                                json_file
 4326                            )
 4327
 4328                    # Error preset file
 4329                    else:
 4330                        log.error(
 4331                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4332                        )
 4333                        raise ValueError(
 4334                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
 4335                        )
 4336
 4337                # If no analysis dict created
 4338                if not param_exomiser_analysis_dict:
 4339                    log.error(f"No analysis config")
 4340                    raise ValueError(f"No analysis config")
 4341
 4342                # Log
 4343                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4344
 4345                ### PHENOPACKET ###
 4346                ###################
 4347
 4348                # If no PhenoPacket in analysis dict -> check in param
 4349                if "phenopacket" not in param_exomiser_analysis_dict:
 4350
 4351                    # If PhenoPacket in param -> load anlaysis json
 4352                    if param_exomiser.get("phenopacket", None):
 4353
 4354                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
 4355                        param_exomiser_phenopacket = full_path(
 4356                            param_exomiser_phenopacket
 4357                        )
 4358
 4359                        # If param phenopacket is a file and exists
 4360                        if isinstance(
 4361                            param_exomiser_phenopacket, str
 4362                        ) and os.path.exists(param_exomiser_phenopacket):
 4363                            # Load phenopacket file into analysis dict (either yaml or json)
 4364                            with open(param_exomiser_phenopacket) as json_file:
 4365                                param_exomiser_analysis_dict["phenopacket"] = (
 4366                                    yaml.safe_load(json_file)
 4367                                )
 4368
 4369                        # If param phenopacket is a dict
 4370                        elif isinstance(param_exomiser_phenopacket, dict):
 4371                            # Load phenopacket dict into analysis dict (either yaml or json)
 4372                            param_exomiser_analysis_dict["phenopacket"] = (
 4373                                param_exomiser_phenopacket
 4374                            )
 4375
 4376                        # Error phenopacket type
 4377                        else:
 4378                            log.error(f"Phenopacket type unknown. Check param file.")
 4379                            raise ValueError(
 4380                                f"Phenopacket type unknown. Check param file."
 4381                            )
 4382
 4383                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
 4384                if "phenopacket" not in param_exomiser_analysis_dict:
 4385
 4386                    # Init PhenoPacket
 4387                    param_exomiser_analysis_dict["phenopacket"] = {
 4388                        "id": "analysis",
 4389                        "proband": {},
 4390                    }
 4391
 4392                    ### Add subject ###
 4393
 4394                    # If subject exists
 4395                    param_exomiser_subject = param_exomiser.get("subject", {})
 4396
 4397                    # If subject not exists -> found sample ID
 4398                    if not param_exomiser_subject:
 4399
 4400                        # Found sample ID in param
 4401                        sample = param_exomiser.get("sample", None)
 4402
 4403                        # Find sample ID (first sample)
 4404                        if not sample:
 4405                            sample_list = self.get_header_sample_list()
 4406                            if len(sample_list) > 0:
 4407                                sample = sample_list[0]
 4408                            else:
 4409                                log.error(f"No sample found")
 4410                                raise ValueError(f"No sample found")
 4411
 4412                        # Create subject
 4413                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
 4414
 4415                    # Add to dict
 4416                    param_exomiser_analysis_dict["phenopacket"][
 4417                        "subject"
 4418                    ] = param_exomiser_subject
 4419
 4420                    ### Add "phenotypicFeatures" ###
 4421
 4422                    # If phenotypicFeatures exists
 4423                    param_exomiser_phenotypicfeatures = param_exomiser.get(
 4424                        "phenotypicFeatures", []
 4425                    )
 4426
 4427                    # If phenotypicFeatures not exists -> Try to infer from hpo list
 4428                    if not param_exomiser_phenotypicfeatures:
 4429
 4430                        # Found HPO in param
 4431                        param_exomiser_hpo = param_exomiser.get("hpo", [])
 4432
 4433                        # Split HPO if list in string format separated by comma
 4434                        if isinstance(param_exomiser_hpo, str):
 4435                            param_exomiser_hpo = param_exomiser_hpo.split(",")
 4436
 4437                        # Create HPO list
 4438                        for hpo in param_exomiser_hpo:
 4439                            hpo_clean = re.sub("[^0-9]", "", hpo)
 4440                            param_exomiser_phenotypicfeatures.append(
 4441                                {
 4442                                    "type": {
 4443                                        "id": f"HP:{hpo_clean}",
 4444                                        "label": f"HP:{hpo_clean}",
 4445                                    }
 4446                                }
 4447                            )
 4448
 4449                    # Add to dict
 4450                    param_exomiser_analysis_dict["phenopacket"][
 4451                        "phenotypicFeatures"
 4452                    ] = param_exomiser_phenotypicfeatures
 4453
 4454                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
 4455                    if not param_exomiser_phenotypicfeatures:
 4456                        for step in param_exomiser_analysis_dict.get(
 4457                            "analysis", {}
 4458                        ).get("steps", []):
 4459                            if "hiPhivePrioritiser" in step:
 4460                                param_exomiser_analysis_dict.get("analysis", {}).get(
 4461                                    "steps", []
 4462                                ).remove(step)
 4463
 4464                ### Add Input File ###
 4465
 4466                # Initial file name and htsFiles
 4467                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
 4468                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
 4469                    {
 4470                        "uri": tmp_vcf_name,
 4471                        "htsFormat": "VCF",
 4472                        "genomeAssembly": assembly,
 4473                    }
 4474                ]
 4475
 4476                ### Add metaData ###
 4477
 4478                # If metaData not in analysis dict
 4479                if "metaData" not in param_exomiser_analysis_dict:
 4480                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
 4481                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
 4482                        "createdBy": "howard",
 4483                        "phenopacketSchemaVersion": 1,
 4484                    }
 4485
 4486                ### OutputOptions ###
 4487
 4488                # Init output result folder
 4489                output_results = os.path.join(tmp_dir, "results")
 4490
 4491                # If no outputOptions in analysis dict
 4492                if "outputOptions" not in param_exomiser_analysis_dict:
 4493
 4494                    # default output formats
 4495                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
 4496
 4497                    # Get outputOptions in param
 4498                    output_options = param_exomiser.get("outputOptions", None)
 4499
 4500                    # If no output_options in param -> check
 4501                    if not output_options:
 4502                        output_options = {
 4503                            "outputContributingVariantsOnly": False,
 4504                            "numGenes": 0,
 4505                            "outputFormats": defaut_output_formats,
 4506                        }
 4507
 4508                    # Replace outputDirectory in output options
 4509                    output_options["outputDirectory"] = output_results
 4510                    output_options["outputFileName"] = "howard"
 4511
 4512                    # Add outputOptions in analysis dict
 4513                    param_exomiser_analysis_dict["outputOptions"] = output_options
 4514
 4515                else:
 4516
 4517                    # Replace output_results and output format (if exists in param)
 4518                    param_exomiser_analysis_dict["outputOptions"][
 4519                        "outputDirectory"
 4520                    ] = output_results
 4521                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
 4522                        list(
 4523                            set(
 4524                                param_exomiser_analysis_dict.get(
 4525                                    "outputOptions", {}
 4526                                ).get("outputFormats", [])
 4527                                + ["TSV_VARIANT", "VCF"]
 4528                            )
 4529                        )
 4530                    )
 4531
 4532                # log
 4533                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
 4534
 4535                ### ANALYSIS FILE ###
 4536                #####################
 4537
 4538                ### Full JSON analysis config file ###
 4539
 4540                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
 4541                with open(exomiser_analysis, "w") as fp:
 4542                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
 4543
 4544                ### SPLIT analysis and sample config files
 4545
 4546                # Splitted analysis dict
 4547                param_exomiser_analysis_dict_for_split = (
 4548                    param_exomiser_analysis_dict.copy()
 4549                )
 4550
 4551                # Phenopacket JSON file
 4552                exomiser_analysis_phenopacket = os.path.join(
 4553                    tmp_dir, "analysis_phenopacket.json"
 4554                )
 4555                with open(exomiser_analysis_phenopacket, "w") as fp:
 4556                    json.dump(
 4557                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
 4558                        fp,
 4559                        indent=4,
 4560                    )
 4561
 4562                # Analysis JSON file without Phenopacket parameters
 4563                param_exomiser_analysis_dict_for_split.pop("phenopacket")
 4564                exomiser_analysis_analysis = os.path.join(
 4565                    tmp_dir, "analysis_analysis.json"
 4566                )
 4567                with open(exomiser_analysis_analysis, "w") as fp:
 4568                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
 4569
 4570                ### INITAL VCF file ###
 4571                #######################
 4572
 4573                ### Create list of samples to use and include inti initial VCF file ####
 4574
 4575                # Subject (main sample)
 4576                # Get sample ID in analysis dict
 4577                sample_subject = (
 4578                    param_exomiser_analysis_dict.get("phenopacket", {})
 4579                    .get("subject", {})
 4580                    .get("id", None)
 4581                )
 4582                sample_proband = (
 4583                    param_exomiser_analysis_dict.get("phenopacket", {})
 4584                    .get("proband", {})
 4585                    .get("subject", {})
 4586                    .get("id", None)
 4587                )
 4588                sample = []
 4589                if sample_subject:
 4590                    sample.append(sample_subject)
 4591                if sample_proband:
 4592                    sample.append(sample_proband)
 4593
 4594                # Get sample ID within Pedigree
 4595                pedigree_persons_list = (
 4596                    param_exomiser_analysis_dict.get("phenopacket", {})
 4597                    .get("pedigree", {})
 4598                    .get("persons", {})
 4599                )
 4600
 4601                # Create list with all sample ID in pedigree (if exists)
 4602                pedigree_persons = []
 4603                for person in pedigree_persons_list:
 4604                    pedigree_persons.append(person.get("individualId"))
 4605
 4606                # Concat subject sample ID and samples ID in pedigreesamples
 4607                samples = list(set(sample + pedigree_persons))
 4608
 4609                # Check if sample list is not empty
 4610                if not samples:
 4611                    log.error(f"No samples found")
 4612                    raise ValueError(f"No samples found")
 4613
 4614                # Create VCF with sample (either sample in param or first one by default)
 4615                # Export VCF file
 4616                self.export_variant_vcf(
 4617                    vcf_file=tmp_vcf_name,
 4618                    remove_info=True,
 4619                    add_samples=True,
 4620                    list_samples=samples,
 4621                    index=False,
 4622                )
 4623
 4624                ### Execute Exomiser ###
 4625                ########################
 4626
 4627                # Init command
 4628                exomiser_command = ""
 4629
 4630                # Command exomiser options
 4631                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
 4632
 4633                # Release
 4634                exomiser_release = param_exomiser.get("release", None)
 4635                if exomiser_release:
 4636                    # phenotype data version
 4637                    exomiser_options += (
 4638                        f" --exomiser.phenotype.data-version={exomiser_release} "
 4639                    )
 4640                    # data version
 4641                    exomiser_options += (
 4642                        f" --exomiser.{assembly}.data-version={exomiser_release} "
 4643                    )
 4644                    # variant white list
 4645                    variant_white_list_file = (
 4646                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
 4647                    )
 4648                    if os.path.exists(
 4649                        os.path.join(
 4650                            databases_folders, assembly, variant_white_list_file
 4651                        )
 4652                    ):
 4653                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
 4654
 4655                # transcript_source
 4656                transcript_source = param_exomiser.get(
 4657                    "transcript_source", None
 4658                )  # ucsc, refseq, ensembl
 4659                if transcript_source:
 4660                    exomiser_options += (
 4661                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
 4662                    )
 4663
 4664                # If analysis contain proband param
 4665                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
 4666                    "proband", {}
 4667                ):
 4668                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
 4669
 4670                # If no proband (usually uniq sample)
 4671                else:
 4672                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
 4673
 4674                # Log
 4675                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
 4676
 4677                # Run command
 4678                result = subprocess.call(
 4679                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
 4680                )
 4681                if result:
 4682                    log.error("Exomiser command failed")
 4683                    raise ValueError("Exomiser command failed")
 4684
 4685                ### RESULTS ###
 4686                ###############
 4687
 4688                ### Annotate with TSV fields ###
 4689
 4690                # Init result tsv file
 4691                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
 4692
 4693                # Init result tsv file
 4694                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
 4695
 4696                # Parse TSV file and explode columns in INFO field
 4697                if exomiser_to_info and os.path.exists(output_results_tsv):
 4698
 4699                    # Log
 4700                    log.debug("Exomiser columns to VCF INFO field")
 4701
 4702                    # Retrieve columns and types
 4703                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
 4704                    output_results_tsv_df = self.get_query_to_df(query)
 4705                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
 4706
 4707                    # Init concat fields for update
 4708                    sql_query_update_concat_fields = []
 4709
 4710                    # Fields to avoid
 4711                    fields_to_avoid = [
 4712                        "CONTIG",
 4713                        "START",
 4714                        "END",
 4715                        "REF",
 4716                        "ALT",
 4717                        "QUAL",
 4718                        "FILTER",
 4719                        "GENOTYPE",
 4720                    ]
 4721
 4722                    # List all columns to add into header
 4723                    for header_column in output_results_tsv_columns:
 4724
 4725                        # If header column is enable
 4726                        if header_column not in fields_to_avoid:
 4727
 4728                            # Header info type
 4729                            header_info_type = "String"
 4730                            header_column_df = output_results_tsv_df[header_column]
 4731                            header_column_df_dtype = header_column_df.dtype
 4732                            if header_column_df_dtype == object:
 4733                                if (
 4734                                    pd.to_numeric(header_column_df, errors="coerce")
 4735                                    .notnull()
 4736                                    .all()
 4737                                ):
 4738                                    header_info_type = "Float"
 4739                            else:
 4740                                header_info_type = "Integer"
 4741
 4742                            # Header info
 4743                            characters_to_validate = ["-"]
 4744                            pattern = "[" + "".join(characters_to_validate) + "]"
 4745                            header_info_name = re.sub(
 4746                                pattern,
 4747                                "_",
 4748                                f"Exomiser_{header_column}".replace("#", ""),
 4749                            )
 4750                            header_info_number = "."
 4751                            header_info_description = (
 4752                                f"Exomiser {header_column} annotation"
 4753                            )
 4754                            header_info_source = "Exomiser"
 4755                            header_info_version = "unknown"
 4756                            header_info_code = CODE_TYPE_MAP[header_info_type]
 4757                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
 4758                                header_info_name,
 4759                                header_info_number,
 4760                                header_info_type,
 4761                                header_info_description,
 4762                                header_info_source,
 4763                                header_info_version,
 4764                                header_info_code,
 4765                            )
 4766
 4767                            # Add field to add for update to concat fields
 4768                            sql_query_update_concat_fields.append(
 4769                                f"""
 4770                                CASE
 4771                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
 4772                                    THEN concat(
 4773                                        '{header_info_name}=',
 4774                                        table_parquet."{header_column}",
 4775                                        ';'
 4776                                        )
 4777
 4778                                    ELSE ''
 4779                                END
 4780                            """
 4781                            )
 4782
 4783                    # Update query
 4784                    sql_query_update = f"""
 4785                        UPDATE {table_variants} as table_variants
 4786                            SET INFO = concat(
 4787                                            CASE
 4788                                                WHEN INFO NOT IN ('', '.')
 4789                                                THEN INFO
 4790                                                ELSE ''
 4791                                            END,
 4792                                            CASE
 4793                                                WHEN table_variants.INFO NOT IN ('','.')
 4794                                                THEN ';'
 4795                                                ELSE ''
 4796                                            END,
 4797                                            (
 4798                                            SELECT 
 4799                                                concat(
 4800                                                    {",".join(sql_query_update_concat_fields)}
 4801                                                )
 4802                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
 4803                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
 4804                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
 4805                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 4806                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 4807                                            )
 4808                                        )
 4809                            ;
 4810                        """
 4811
 4812                    # Update
 4813                    self.conn.execute(sql_query_update)
 4814
 4815                ### Annotate with VCF INFO field ###
 4816
 4817                # Init result VCF file
 4818                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
 4819
 4820                # If VCF exists
 4821                if os.path.exists(output_results_vcf):
 4822
 4823                    # Log
 4824                    log.debug("Exomiser result VCF update variants")
 4825
 4826                    # Find Exomiser INFO field annotation in header
 4827                    with gzip.open(output_results_vcf, "rt") as f:
 4828                        header_list = self.read_vcf_header(f)
 4829                    exomiser_vcf_header = vcf.Reader(
 4830                        io.StringIO("\n".join(header_list))
 4831                    )
 4832
 4833                    # Add annotation INFO field to header
 4834                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
 4835
 4836                    # Update variants with VCF
 4837                    self.update_from_vcf(output_results_vcf)
 4838
 4839        return True
 4840
 4841    def annotation_snpeff(self, threads: int = None) -> None:
 4842        """
 4843        This function annotate with snpEff
 4844
 4845        :param threads: The number of threads to use
 4846        :return: the value of the variable "return_value".
 4847        """
 4848
 4849        # DEBUG
 4850        log.debug("Start annotation with snpeff databases")
 4851
 4852        # Threads
 4853        if not threads:
 4854            threads = self.get_threads()
 4855        log.debug("Threads: " + str(threads))
 4856
 4857        # DEBUG
 4858        delete_tmp = True
 4859        if self.get_config().get("verbosity", "warning") in ["debug"]:
 4860            delete_tmp = False
 4861            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 4862
 4863        # Config
 4864        config = self.get_config()
 4865        log.debug("Config: " + str(config))
 4866
 4867        # Config - Folders - Databases
 4868        databases_folders = (
 4869            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
 4870        )
 4871        log.debug("Databases annotations: " + str(databases_folders))
 4872
 4873        # # Config - Java
 4874        # java_bin = get_bin(
 4875        #     tool="java",
 4876        #     bin="java",
 4877        #     bin_type="bin",
 4878        #     config=config,
 4879        #     default_folder="/usr/bin",
 4880        # )
 4881        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
 4882        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
 4883        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
 4884
 4885        # # Config - snpEff bin
 4886        # snpeff_jar = get_bin(
 4887        #     tool="snpeff",
 4888        #     bin="snpEff.jar",
 4889        #     bin_type="jar",
 4890        #     config=config,
 4891        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4892        # )
 4893        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
 4894        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4895        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
 4896
 4897        # Config - snpEff bin command
 4898        snpeff_bin_command = get_bin_command(
 4899            bin="snpEff.jar",
 4900            tool="snpeff",
 4901            bin_type="jar",
 4902            config=config,
 4903            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
 4904        )
 4905        if not snpeff_bin_command:
 4906            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
 4907            log.error(msg_err)
 4908            raise ValueError(msg_err)
 4909
 4910        # Config - snpEff databases
 4911        snpeff_databases = (
 4912            config.get("folders", {})
 4913            .get("databases", {})
 4914            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
 4915        )
 4916        snpeff_databases = full_path(snpeff_databases)
 4917        if snpeff_databases is not None and snpeff_databases != "":
 4918            log.debug(f"Create snpEff databases folder")
 4919            if not os.path.exists(snpeff_databases):
 4920                os.makedirs(snpeff_databases)
 4921
 4922        # Param
 4923        param = self.get_param()
 4924        log.debug("Param: " + str(param))
 4925
 4926        # Param
 4927        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
 4928        log.debug("Options: " + str(options))
 4929
 4930        # Param - Assembly
 4931        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 4932
 4933        # Param - Options
 4934        snpeff_options = (
 4935            param.get("annotation", {}).get("snpeff", {}).get("options", "")
 4936        )
 4937        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
 4938        snpeff_csvstats = (
 4939            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
 4940        )
 4941        if snpeff_stats:
 4942            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
 4943            snpeff_stats = full_path(snpeff_stats)
 4944            snpeff_options += f" -stats {snpeff_stats}"
 4945        if snpeff_csvstats:
 4946            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
 4947            snpeff_csvstats = full_path(snpeff_csvstats)
 4948            snpeff_options += f" -csvStats {snpeff_csvstats}"
 4949
 4950        # Data
 4951        table_variants = self.get_table_variants()
 4952
 4953        # Check if not empty
 4954        log.debug("Check if not empty")
 4955        sql_query_chromosomes = (
 4956            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 4957        )
 4958        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
 4959        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 4960            log.info(f"VCF empty")
 4961            return
 4962
 4963        # Export in VCF
 4964        log.debug("Create initial file to annotate")
 4965        tmp_vcf = NamedTemporaryFile(
 4966            prefix=self.get_prefix(),
 4967            dir=self.get_tmp_dir(),
 4968            suffix=".vcf.gz",
 4969            delete=True,
 4970        )
 4971        tmp_vcf_name = tmp_vcf.name
 4972
 4973        # VCF header
 4974        vcf_reader = self.get_header()
 4975        log.debug("Initial header: " + str(vcf_reader.infos))
 4976
 4977        # Existing annotations
 4978        for vcf_annotation in self.get_header().infos:
 4979
 4980            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 4981            log.debug(
 4982                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 4983            )
 4984
 4985        # Memory limit
 4986        # if config.get("memory", None):
 4987        #     memory_limit = config.get("memory", "8G")
 4988        # else:
 4989        #     memory_limit = "8G"
 4990        memory_limit = self.get_memory("8G")
 4991        log.debug(f"memory_limit: {memory_limit}")
 4992
 4993        # snpEff java options
 4994        snpeff_java_options = (
 4995            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
 4996        )
 4997        log.debug(f"Exomiser java options: {snpeff_java_options}")
 4998
 4999        force_update_annotation = True
 5000
 5001        if "ANN" not in self.get_header().infos or force_update_annotation:
 5002
 5003            # Check snpEff database
 5004            log.debug(f"Check snpEff databases {[assembly]}")
 5005            databases_download_snpeff(
 5006                folder=snpeff_databases, assemblies=[assembly], config=config
 5007            )
 5008
 5009            # Export VCF file
 5010            self.export_variant_vcf(
 5011                vcf_file=tmp_vcf_name,
 5012                remove_info=True,
 5013                add_samples=False,
 5014                index=True,
 5015            )
 5016
 5017            # Tmp file
 5018            err_files = []
 5019            tmp_annotate_vcf = NamedTemporaryFile(
 5020                prefix=self.get_prefix(),
 5021                dir=self.get_tmp_dir(),
 5022                suffix=".vcf",
 5023                delete=False,
 5024            )
 5025            tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5026            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5027            err_files.append(tmp_annotate_vcf_name_err)
 5028
 5029            # Command
 5030            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
 5031            log.debug(f"Annotation - snpEff command: {snpeff_command}")
 5032            run_parallel_commands([snpeff_command], 1)
 5033
 5034            # Error messages
 5035            log.info(f"Error/Warning messages:")
 5036            error_message_command_all = []
 5037            error_message_command_warning = []
 5038            error_message_command_err = []
 5039            for err_file in err_files:
 5040                with open(err_file, "r") as f:
 5041                    for line in f:
 5042                        message = line.strip()
 5043                        error_message_command_all.append(message)
 5044                        if line.startswith("[W::"):
 5045                            error_message_command_warning.append(message)
 5046                        if line.startswith("[E::"):
 5047                            error_message_command_err.append(f"{err_file}: " + message)
 5048            # log info
 5049            for message in list(
 5050                set(error_message_command_err + error_message_command_warning)
 5051            ):
 5052                log.info(f"   {message}")
 5053            # debug info
 5054            for message in list(set(error_message_command_all)):
 5055                log.debug(f"   {message}")
 5056            # failed
 5057            if len(error_message_command_err):
 5058                log.error("Annotation failed: Error in commands")
 5059                raise ValueError("Annotation failed: Error in commands")
 5060
 5061            # Find annotation in header
 5062            with open(tmp_annotate_vcf_name, "rt") as f:
 5063                header_list = self.read_vcf_header(f)
 5064            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5065
 5066            for ann in annovar_vcf_header.infos:
 5067                if ann not in self.get_header().infos:
 5068                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5069
 5070            # Update variants
 5071            log.info(f"Annotation - Updating...")
 5072            self.update_from_vcf(tmp_annotate_vcf_name)
 5073
 5074        else:
 5075            if "ANN" in self.get_header().infos:
 5076                log.debug(f"Existing snpEff annotations in VCF")
 5077            if force_update_annotation:
 5078                log.debug(f"Existing snpEff annotations in VCF - annotation forced")
 5079
 5080    def annotation_annovar(self, threads: int = None) -> None:
 5081        """
 5082        It takes a VCF file, annotates it with Annovar, and then updates the database with the new
 5083        annotations
 5084
 5085        :param threads: number of threads to use
 5086        :return: the value of the variable "return_value".
 5087        """
 5088
 5089        # DEBUG
 5090        log.debug("Start annotation with Annovar databases")
 5091
 5092        # Threads
 5093        if not threads:
 5094            threads = self.get_threads()
 5095        log.debug("Threads: " + str(threads))
 5096
 5097        # Tmp en Err files
 5098        tmp_files = []
 5099        err_files = []
 5100
 5101        # DEBUG
 5102        delete_tmp = True
 5103        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5104            delete_tmp = False
 5105            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5106
 5107        # Config
 5108        config = self.get_config()
 5109        log.debug("Config: " + str(config))
 5110
 5111        # Config - Folders - Databases
 5112        databases_folders = (
 5113            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
 5114        )
 5115        log.debug("Databases annotations: " + str(databases_folders))
 5116
 5117        # Config - annovar bin command
 5118        annovar_bin_command = get_bin_command(
 5119            bin="table_annovar.pl",
 5120            tool="annovar",
 5121            bin_type="perl",
 5122            config=config,
 5123            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
 5124        )
 5125        if not annovar_bin_command:
 5126            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
 5127            log.error(msg_err)
 5128            raise ValueError(msg_err)
 5129
 5130        # Config - BCFTools bin command
 5131        bcftools_bin_command = get_bin_command(
 5132            bin="bcftools",
 5133            tool="bcftools",
 5134            bin_type="bin",
 5135            config=config,
 5136            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
 5137        )
 5138        if not bcftools_bin_command:
 5139            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
 5140            log.error(msg_err)
 5141            raise ValueError(msg_err)
 5142
 5143        # Config - annovar databases
 5144        annovar_databases = (
 5145            config.get("folders", {})
 5146            .get("databases", {})
 5147            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
 5148        )
 5149        annovar_databases = full_path(annovar_databases)
 5150        if annovar_databases != "" and not os.path.exists(annovar_databases):
 5151            os.makedirs(annovar_databases)
 5152
 5153        # Param
 5154        param = self.get_param()
 5155        log.debug("Param: " + str(param))
 5156
 5157        # Param - options
 5158        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
 5159        log.debug("Options: " + str(options))
 5160
 5161        # Param - annotations
 5162        annotations = (
 5163            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
 5164        )
 5165        log.debug("Annotations: " + str(annotations))
 5166
 5167        # Param - Assembly
 5168        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 5169
 5170        # Annovar database assembly
 5171        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
 5172        if annovar_databases_assembly != "" and not os.path.exists(
 5173            annovar_databases_assembly
 5174        ):
 5175            os.makedirs(annovar_databases_assembly)
 5176
 5177        # Data
 5178        table_variants = self.get_table_variants()
 5179
 5180        # Check if not empty
 5181        log.debug("Check if not empty")
 5182        sql_query_chromosomes = (
 5183            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 5184        )
 5185        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
 5186        if not sql_query_chromosomes_df["count"][0]:
 5187            log.info(f"VCF empty")
 5188            return
 5189
 5190        # VCF header
 5191        vcf_reader = self.get_header()
 5192        log.debug("Initial header: " + str(vcf_reader.infos))
 5193
 5194        # Existing annotations
 5195        for vcf_annotation in self.get_header().infos:
 5196
 5197            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5198            log.debug(
 5199                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5200            )
 5201
 5202        force_update_annotation = True
 5203
 5204        if annotations:
 5205
 5206            commands = []
 5207            tmp_annotates_vcf_name_list = []
 5208
 5209            # Export in VCF
 5210            log.debug("Create initial file to annotate")
 5211            tmp_vcf = NamedTemporaryFile(
 5212                prefix=self.get_prefix(),
 5213                dir=self.get_tmp_dir(),
 5214                suffix=".vcf.gz",
 5215                delete=False,
 5216            )
 5217            tmp_vcf_name = tmp_vcf.name
 5218            tmp_files.append(tmp_vcf_name)
 5219            tmp_files.append(tmp_vcf_name + ".tbi")
 5220
 5221            # Export VCF file
 5222            self.export_variant_vcf(
 5223                vcf_file=tmp_vcf_name,
 5224                remove_info=".",
 5225                add_samples=False,
 5226                index=True,
 5227            )
 5228
 5229            # Create file for field rename
 5230            log.debug("Create file for field rename")
 5231            tmp_rename = NamedTemporaryFile(
 5232                prefix=self.get_prefix(),
 5233                dir=self.get_tmp_dir(),
 5234                suffix=".rename",
 5235                delete=False,
 5236            )
 5237            tmp_rename_name = tmp_rename.name
 5238            tmp_files.append(tmp_rename_name)
 5239
 5240            # Check Annovar database
 5241            log.debug(
 5242                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
 5243            )
 5244            databases_download_annovar(
 5245                folder=annovar_databases,
 5246                files=list(annotations.keys()),
 5247                assemblies=[assembly],
 5248            )
 5249
 5250            for annotation in annotations:
 5251                annotation_fields = annotations[annotation]
 5252
 5253                if not annotation_fields:
 5254                    annotation_fields = {"INFO": None}
 5255
 5256                log.info(f"Annotations Annovar - database '{annotation}'")
 5257                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")
 5258
 5259                # Tmp file for annovar
 5260                err_files = []
 5261                tmp_annotate_vcf_directory = TemporaryDirectory(
 5262                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
 5263                )
 5264                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
 5265                tmp_annotate_vcf_name_annovar = (
 5266                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
 5267                )
 5268                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
 5269                err_files.append(tmp_annotate_vcf_name_err)
 5270                tmp_files.append(tmp_annotate_vcf_name_err)
 5271
 5272                # Tmp file final vcf annotated by annovar
 5273                tmp_annotate_vcf = NamedTemporaryFile(
 5274                    prefix=self.get_prefix(),
 5275                    dir=self.get_tmp_dir(),
 5276                    suffix=".vcf.gz",
 5277                    delete=False,
 5278                )
 5279                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5280                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
 5281                tmp_files.append(tmp_annotate_vcf_name)
 5282                tmp_files.append(tmp_annotate_vcf_name + ".tbi")
 5283
 5284                # Number of fields
 5285                annotation_list = []
 5286                annotation_renamed_list = []
 5287
 5288                for annotation_field in annotation_fields:
 5289
 5290                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
 5291                    annotation_fields_new_name = annotation_fields.get(
 5292                        annotation_field, annotation_field
 5293                    )
 5294                    if not annotation_fields_new_name:
 5295                        annotation_fields_new_name = annotation_field
 5296
 5297                    if (
 5298                        force_update_annotation
 5299                        or annotation_fields_new_name not in self.get_header().infos
 5300                    ):
 5301                        annotation_list.append(annotation_field)
 5302                        annotation_renamed_list.append(annotation_fields_new_name)
 5303                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
 5304                        log.warning(
 5305                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
 5306                        )
 5307
 5308                    # Add rename info
 5309                    run_parallel_commands(
 5310                        [
 5311                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
 5312                        ],
 5313                        1,
 5314                    )
 5315
 5316                # log.debug("fields_to_removed: " + str(fields_to_removed))
 5317                log.debug("annotation_list: " + str(annotation_list))
 5318
 5319                # protocol
 5320                protocol = annotation
 5321
 5322                # argument
 5323                argument = ""
 5324
 5325                # operation
 5326                operation = "f"
 5327                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
 5328                    "ensGene"
 5329                ):
 5330                    operation = "g"
 5331                    if options.get("genebase", None):
 5332                        argument = f"""'{options.get("genebase","")}'"""
 5333                elif annotation in ["cytoBand"]:
 5334                    operation = "r"
 5335
 5336                # argument option
 5337                argument_option = ""
 5338                if argument != "":
 5339                    argument_option = " --argument " + argument
 5340
 5341                # command options
 5342                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
 5343                for option in options:
 5344                    if option not in ["genebase"]:
 5345                        command_options += f""" --{option}={options[option]}"""
 5346
 5347                # Command
 5348
 5349                # Command - Annovar
 5350                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
 5351                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")
 5352
 5353                # Command - start pipe
 5354                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """
 5355
 5356                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
 5357                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """
 5358
 5359                # Command - Special characters (refGene annotation)
 5360                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """
 5361
 5362                # Command - Clean empty fields (with value ".")
 5363                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """
 5364
 5365                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
 5366                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
 5367                if "ALL" not in annotation_list and "INFO" not in annotation_list:
 5368                    # for ann in annotation_renamed_list:
 5369                    for ann in annotation_list:
 5370                        annovar_fields_to_keep.append(f"^INFO/{ann}")
 5371
 5372                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """
 5373
 5374                # Command - indexing
 5375                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """
 5376
 5377                log.debug(f"Annotation - Annovar command: {command_annovar}")
 5378                run_parallel_commands([command_annovar], 1)
 5379
 5380                # Error messages
 5381                log.info(f"Error/Warning messages:")
 5382                error_message_command_all = []
 5383                error_message_command_warning = []
 5384                error_message_command_err = []
 5385                for err_file in err_files:
 5386                    with open(err_file, "r") as f:
 5387                        for line in f:
 5388                            message = line.strip()
 5389                            error_message_command_all.append(message)
 5390                            if line.startswith("[W::") or line.startswith("WARNING"):
 5391                                error_message_command_warning.append(message)
 5392                            if line.startswith("[E::") or line.startswith("ERROR"):
 5393                                error_message_command_err.append(
 5394                                    f"{err_file}: " + message
 5395                                )
 5396                # log info
 5397                for message in list(
 5398                    set(error_message_command_err + error_message_command_warning)
 5399                ):
 5400                    log.info(f"   {message}")
 5401                # debug info
 5402                for message in list(set(error_message_command_all)):
 5403                    log.debug(f"   {message}")
 5404                # failed
 5405                if len(error_message_command_err):
 5406                    log.error("Annotation failed: Error in commands")
 5407                    raise ValueError("Annotation failed: Error in commands")
 5408
 5409            if tmp_annotates_vcf_name_list:
 5410
 5411                # List of annotated files
 5412                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)
 5413
 5414                # Tmp file
 5415                tmp_annotate_vcf = NamedTemporaryFile(
 5416                    prefix=self.get_prefix(),
 5417                    dir=self.get_tmp_dir(),
 5418                    suffix=".vcf.gz",
 5419                    delete=False,
 5420                )
 5421                tmp_annotate_vcf_name = tmp_annotate_vcf.name
 5422                tmp_files.append(tmp_annotate_vcf_name)
 5423                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
 5424                err_files.append(tmp_annotate_vcf_name_err)
 5425                tmp_files.append(tmp_annotate_vcf_name_err)
 5426
 5427                # Command merge
 5428                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
 5429                log.info(
 5430                    f"Annotation Annovar - Annotation merging "
 5431                    + str(len(tmp_annotates_vcf_name_list))
 5432                    + " annotated files"
 5433                )
 5434                log.debug(f"Annotation - merge command: {merge_command}")
 5435                run_parallel_commands([merge_command], 1)
 5436
 5437                # Find annotation in header
 5438                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
 5439                    header_list = self.read_vcf_header(f)
 5440                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
 5441
 5442                for ann in annovar_vcf_header.infos:
 5443                    if ann not in self.get_header().infos:
 5444                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
 5445
 5446                # Update variants
 5447                log.info(f"Annotation Annovar - Updating...")
 5448                self.update_from_vcf(tmp_annotate_vcf_name)
 5449
 5450            # Clean files
 5451            # Tmp file remove command
 5452            if True:
 5453                tmp_files_remove_command = ""
 5454                if tmp_files:
 5455                    tmp_files_remove_command = " ".join(tmp_files)
 5456                clean_command = f" rm -f {tmp_files_remove_command} "
 5457                log.debug(f"Annotation Annovar - Annotation cleaning ")
 5458                log.debug(f"Annotation - cleaning command: {clean_command}")
 5459                run_parallel_commands([clean_command], 1)
 5460
 5461    # Parquet
 5462    def annotation_parquet(self, threads: int = None) -> None:
 5463        """
 5464        It takes a VCF file, and annotates it with a parquet file
 5465
 5466        :param threads: number of threads to use for the annotation
 5467        :return: the value of the variable "result".
 5468        """
 5469
 5470        # DEBUG
 5471        log.debug("Start annotation with parquet databases")
 5472
 5473        # Threads
 5474        if not threads:
 5475            threads = self.get_threads()
 5476        log.debug("Threads: " + str(threads))
 5477
 5478        # DEBUG
 5479        delete_tmp = True
 5480        if self.get_config().get("verbosity", "warning") in ["debug"]:
 5481            delete_tmp = False
 5482            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 5483
 5484        # Config
 5485        databases_folders = set(
 5486            self.get_config()
 5487            .get("folders", {})
 5488            .get("databases", {})
 5489            .get("annotations", ["."])
 5490            + self.get_config()
 5491            .get("folders", {})
 5492            .get("databases", {})
 5493            .get("parquet", ["."])
 5494        )
 5495        log.debug("Databases annotations: " + str(databases_folders))
 5496
 5497        # Param
 5498        annotations = (
 5499            self.get_param()
 5500            .get("annotation", {})
 5501            .get("parquet", {})
 5502            .get("annotations", None)
 5503        )
 5504        log.debug("Annotations: " + str(annotations))
 5505
 5506        # Assembly
 5507        assembly = self.get_param().get(
 5508            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
 5509        )
 5510
 5511        # Force Update Annotation
 5512        force_update_annotation = (
 5513            self.get_param()
 5514            .get("annotation", {})
 5515            .get("options", {})
 5516            .get("annotations_update", False)
 5517        )
 5518        log.debug(f"force_update_annotation={force_update_annotation}")
 5519        force_append_annotation = (
 5520            self.get_param()
 5521            .get("annotation", {})
 5522            .get("options", {})
 5523            .get("annotations_append", False)
 5524        )
 5525        log.debug(f"force_append_annotation={force_append_annotation}")
 5526
 5527        # Data
 5528        table_variants = self.get_table_variants()
 5529
 5530        # Check if not empty
 5531        log.debug("Check if not empty")
 5532        sql_query_chromosomes_df = self.get_query_to_df(
 5533            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
 5534        )
 5535        if not sql_query_chromosomes_df["count"][0]:
 5536            log.info(f"VCF empty")
 5537            return
 5538
 5539        # VCF header
 5540        vcf_reader = self.get_header()
 5541        log.debug("Initial header: " + str(vcf_reader.infos))
 5542
 5543        # Nb Variants POS
 5544        log.debug("NB Variants Start")
 5545        nb_variants = self.conn.execute(
 5546            f"SELECT count(*) AS count FROM variants"
 5547        ).fetchdf()["count"][0]
 5548        log.debug("NB Variants Stop")
 5549
 5550        # Existing annotations
 5551        for vcf_annotation in self.get_header().infos:
 5552
 5553            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 5554            log.debug(
 5555                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 5556            )
 5557
 5558        # Added columns
 5559        added_columns = []
 5560
 5561        # drop indexes
 5562        log.debug(f"Drop indexes...")
 5563        self.drop_indexes()
 5564
 5565        if annotations:
 5566
 5567            if "ALL" in annotations:
 5568
 5569                all_param = annotations.get("ALL", {})
 5570                all_param_formats = all_param.get("formats", None)
 5571                all_param_releases = all_param.get("releases", None)
 5572
 5573                databases_infos_dict = self.scan_databases(
 5574                    database_formats=all_param_formats,
 5575                    database_releases=all_param_releases,
 5576                )
 5577                for database_infos in databases_infos_dict.keys():
 5578                    if database_infos not in annotations:
 5579                        annotations[database_infos] = {"INFO": None}
 5580
 5581            for annotation in annotations:
 5582
 5583                if annotation in ["ALL"]:
 5584                    continue
 5585
 5586                # Annotation Name
 5587                annotation_name = os.path.basename(annotation)
 5588
 5589                # Annotation fields
 5590                annotation_fields = annotations[annotation]
 5591                if not annotation_fields:
 5592                    annotation_fields = {"INFO": None}
 5593
 5594                log.debug(f"Annotation '{annotation_name}'")
 5595                log.debug(
 5596                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
 5597                )
 5598
 5599                # Create Database
 5600                database = Database(
 5601                    database=annotation,
 5602                    databases_folders=databases_folders,
 5603                    assembly=assembly,
 5604                )
 5605
 5606                # Find files
 5607                parquet_file = database.get_database()
 5608                parquet_hdr_file = database.get_header_file()
 5609                parquet_type = database.get_type()
 5610
 5611                # Check if files exists
 5612                if not parquet_file or not parquet_hdr_file:
 5613                    log.error("Annotation failed: file not found")
 5614                    raise ValueError("Annotation failed: file not found")
 5615                else:
 5616                    # Get parquet connexion
 5617                    parquet_sql_attach = database.get_sql_database_attach(
 5618                        output="query"
 5619                    )
 5620                    if parquet_sql_attach:
 5621                        self.conn.execute(parquet_sql_attach)
 5622                    parquet_file_link = database.get_sql_database_link()
 5623                    # Log
 5624                    log.debug(
 5625                        f"Annotation '{annotation_name}' - file: "
 5626                        + str(parquet_file)
 5627                        + " and "
 5628                        + str(parquet_hdr_file)
 5629                    )
 5630
 5631                    # Database full header columns
 5632                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
 5633                        parquet_hdr_file
 5634                    )
 5635                    # Log
 5636                    log.debug(
 5637                        "Annotation database header columns : "
 5638                        + str(parquet_hdr_vcf_header_columns)
 5639                    )
 5640
 5641                    # Load header as VCF object
 5642                    parquet_hdr_vcf_header_infos = database.get_header().infos
 5643                    # Log
 5644                    log.debug(
 5645                        "Annotation database header: "
 5646                        + str(parquet_hdr_vcf_header_infos)
 5647                    )
 5648
 5649                    # Get extra infos
 5650                    parquet_columns = database.get_extra_columns()
 5651                    # Log
 5652                    log.debug("Annotation database Columns: " + str(parquet_columns))
 5653
 5654                    # Add extra columns if "ALL" in annotation_fields
 5655                    # if "ALL" in annotation_fields:
 5656                    #     allow_add_extra_column = True
 5657                    if "ALL" in annotation_fields and database.get_extra_columns():
 5658                        for extra_column in database.get_extra_columns():
 5659                            if (
 5660                                extra_column not in annotation_fields
 5661                                and extra_column.replace("INFO/", "")
 5662                                not in parquet_hdr_vcf_header_infos
 5663                            ):
 5664                                parquet_hdr_vcf_header_infos[extra_column] = (
 5665                                    vcf.parser._Info(
 5666                                        extra_column,
 5667                                        ".",
 5668                                        "String",
 5669                                        f"{extra_column} description",
 5670                                        "unknown",
 5671                                        "unknown",
 5672                                        self.code_type_map["String"],
 5673                                    )
 5674                                )
 5675
 5676                    # For all fields in database
 5677                    annotation_fields_all = False
 5678                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
 5679                        annotation_fields_all = True
 5680                        annotation_fields = {
 5681                            key: key for key in parquet_hdr_vcf_header_infos
 5682                        }
 5683
 5684                        log.debug(
 5685                            "Annotation database header - All annotations added: "
 5686                            + str(annotation_fields)
 5687                        )
 5688
 5689                    # Init
 5690
 5691                    # List of annotation fields to use
 5692                    sql_query_annotation_update_info_sets = []
 5693
 5694                    # List of annotation to agregate
 5695                    sql_query_annotation_to_agregate = []
 5696
 5697                    # Number of fields
 5698                    nb_annotation_field = 0
 5699
 5700                    # Annotation fields processed
 5701                    annotation_fields_processed = []
 5702
 5703                    # Columns mapping
 5704                    map_columns = database.map_columns(
 5705                        columns=annotation_fields, prefixes=["INFO/"]
 5706                    )
 5707
 5708                    # Query dict for fields to remove (update option)
 5709                    query_dict_remove = {}
 5710
 5711                    # Fetch Anotation fields
 5712                    for annotation_field in annotation_fields:
 5713
 5714                        # annotation_field_column
 5715                        annotation_field_column = map_columns.get(
 5716                            annotation_field, "INFO"
 5717                        )
 5718
 5719                        # field new name, if parametered
 5720                        annotation_fields_new_name = annotation_fields.get(
 5721                            annotation_field, annotation_field
 5722                        )
 5723                        if not annotation_fields_new_name:
 5724                            annotation_fields_new_name = annotation_field
 5725
 5726                        # To annotate
 5727                        # force_update_annotation = True
 5728                        # force_append_annotation = True
 5729                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
 5730                        if annotation_field in parquet_hdr_vcf_header_infos and (
 5731                            force_update_annotation
 5732                            or force_append_annotation
 5733                            or (
 5734                                annotation_fields_new_name
 5735                                not in self.get_header().infos
 5736                            )
 5737                        ):
 5738
 5739                            # Add field to annotation to process list
 5740                            annotation_fields_processed.append(
 5741                                annotation_fields_new_name
 5742                            )
 5743
 5744                            # explode infos for the field
 5745                            annotation_fields_new_name_info_msg = ""
 5746                            if (
 5747                                force_update_annotation
 5748                                and annotation_fields_new_name
 5749                                in self.get_header().infos
 5750                            ):
 5751                                # Remove field from INFO
 5752                                query = f"""
 5753                                    UPDATE {table_variants} as table_variants
 5754                                    SET INFO = REGEXP_REPLACE(
 5755                                                concat(table_variants.INFO,''),
 5756                                                ';*{annotation_fields_new_name}=[^;]*',
 5757                                                ''
 5758                                                )
 5759                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
 5760                                """
 5761                                annotation_fields_new_name_info_msg = " [update]"
 5762                                query_dict_remove[
 5763                                    f"remove 'INFO/{annotation_fields_new_name}'"
 5764                                ] = query
 5765
 5766                            # Sep between fields in INFO
 5767                            nb_annotation_field += 1
 5768                            if nb_annotation_field > 1:
 5769                                annotation_field_sep = ";"
 5770                            else:
 5771                                annotation_field_sep = ""
 5772
 5773                            log.info(
 5774                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
 5775                            )
 5776
 5777                            # Add INFO field to header
 5778                            parquet_hdr_vcf_header_infos_number = (
 5779                                parquet_hdr_vcf_header_infos[annotation_field].num
 5780                                or "."
 5781                            )
 5782                            parquet_hdr_vcf_header_infos_type = (
 5783                                parquet_hdr_vcf_header_infos[annotation_field].type
 5784                                or "String"
 5785                            )
 5786                            parquet_hdr_vcf_header_infos_description = (
 5787                                parquet_hdr_vcf_header_infos[annotation_field].desc
 5788                                or f"{annotation_field} description"
 5789                            )
 5790                            parquet_hdr_vcf_header_infos_source = (
 5791                                parquet_hdr_vcf_header_infos[annotation_field].source
 5792                                or "unknown"
 5793                            )
 5794                            parquet_hdr_vcf_header_infos_version = (
 5795                                parquet_hdr_vcf_header_infos[annotation_field].version
 5796                                or "unknown"
 5797                            )
 5798
 5799                            vcf_reader.infos[annotation_fields_new_name] = (
 5800                                vcf.parser._Info(
 5801                                    annotation_fields_new_name,
 5802                                    parquet_hdr_vcf_header_infos_number,
 5803                                    parquet_hdr_vcf_header_infos_type,
 5804                                    parquet_hdr_vcf_header_infos_description,
 5805                                    parquet_hdr_vcf_header_infos_source,
 5806                                    parquet_hdr_vcf_header_infos_version,
 5807                                    self.code_type_map[
 5808                                        parquet_hdr_vcf_header_infos_type
 5809                                    ],
 5810                                )
 5811                            )
 5812
 5813                            # Append
 5814                            if force_append_annotation:
 5815                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
 5816                            else:
 5817                                query_case_when_append = ""
 5818
 5819                            # Annotation/Update query fields
 5820                            # Found in INFO column
 5821                            if (
 5822                                annotation_field_column == "INFO"
 5823                                and "INFO" in parquet_hdr_vcf_header_columns
 5824                            ):
 5825                                sql_query_annotation_update_info_sets.append(
 5826                                    f"""
 5827                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
 5828                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
 5829                                        ELSE ''
 5830                                    END
 5831                                """
 5832                                )
 5833                            # Found in a specific column
 5834                            else:
 5835                                sql_query_annotation_update_info_sets.append(
 5836                                    f"""
 5837                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
 5838                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
 5839                                        ELSE ''
 5840                                    END
 5841                                """
 5842                                )
 5843                                sql_query_annotation_to_agregate.append(
 5844                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
 5845                                )
 5846
 5847                        # Not to annotate
 5848                        else:
 5849
 5850                            if force_update_annotation:
 5851                                annotation_message = "forced"
 5852                            else:
 5853                                annotation_message = "skipped"
 5854
 5855                            if annotation_field not in parquet_hdr_vcf_header_infos:
 5856                                log.warning(
 5857                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
 5858                                )
 5859                            if annotation_fields_new_name in self.get_header().infos:
 5860                                log.warning(
 5861                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
 5862                                )
 5863
 5864                    # Check if ALL fields have to be annotated. Thus concat all INFO field
 5865                    # allow_annotation_full_info = True
 5866                    allow_annotation_full_info = not force_append_annotation
 5867
 5868                    if parquet_type in ["regions"]:
 5869                        allow_annotation_full_info = False
 5870
 5871                    if (
 5872                        allow_annotation_full_info
 5873                        and nb_annotation_field == len(annotation_fields)
 5874                        and annotation_fields_all
 5875                        and (
 5876                            "INFO" in parquet_hdr_vcf_header_columns
 5877                            and "INFO" in database.get_extra_columns()
 5878                        )
 5879                    ):
 5880                        log.debug("Column INFO annotation enabled")
 5881                        sql_query_annotation_update_info_sets = []
 5882                        sql_query_annotation_update_info_sets.append(
 5883                            f" table_parquet.INFO "
 5884                        )
 5885
 5886                    if sql_query_annotation_update_info_sets:
 5887
 5888                        # Annotate
 5889                        log.info(f"Annotation '{annotation_name}' - Annotation...")
 5890
 5891                        # Join query annotation update info sets for SQL
 5892                        sql_query_annotation_update_info_sets_sql = ",".join(
 5893                            sql_query_annotation_update_info_sets
 5894                        )
 5895
 5896                        # Check chromosomes list (and variants infos)
 5897                        sql_query_chromosomes = f"""
 5898                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
 5899                            FROM {table_variants} as table_variants
 5900                            GROUP BY table_variants."#CHROM"
 5901                            ORDER BY table_variants."#CHROM"
 5902                            """
 5903                        sql_query_chromosomes_df = self.conn.execute(
 5904                            sql_query_chromosomes
 5905                        ).df()
 5906                        sql_query_chromosomes_dict = {
 5907                            entry["CHROM"]: {
 5908                                "count": entry["count_variants"],
 5909                                "min": entry["min_variants"],
 5910                                "max": entry["max_variants"],
 5911                            }
 5912                            for index, entry in sql_query_chromosomes_df.iterrows()
 5913                        }
 5914
 5915                        # Init
 5916                        nb_of_query = 0
 5917                        nb_of_variant_annotated = 0
 5918                        query_dict = query_dict_remove
 5919
 5920                        # for chrom in sql_query_chromosomes_df["CHROM"]:
 5921                        for chrom in sql_query_chromosomes_dict:
 5922
 5923                            # Number of variant by chromosome
 5924                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
 5925                                chrom, {}
 5926                            ).get("count", 0)
 5927
 5928                            log.debug(
 5929                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
 5930                            )
 5931
 5932                            # Annotation with regions database
 5933                            if parquet_type in ["regions"]:
 5934                                sql_query_annotation_from_clause = f"""
 5935                                    FROM (
 5936                                        SELECT 
 5937                                            '{chrom}' AS \"#CHROM\",
 5938                                            table_variants_from.\"POS\" AS \"POS\",
 5939                                            {",".join(sql_query_annotation_to_agregate)}
 5940                                        FROM {table_variants} as table_variants_from
 5941                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
 5942                                            table_parquet_from."#CHROM" = '{chrom}'
 5943                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
 5944                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
 5945                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
 5946                                                )
 5947                                        )
 5948                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
 5949                                        GROUP BY table_variants_from.\"POS\"
 5950                                        )
 5951                                        as table_parquet
 5952                                """
 5953
 5954                                sql_query_annotation_where_clause = """
 5955                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
 5956                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5957                                """
 5958
 5959                            # Annotation with variants database
 5960                            else:
 5961                                sql_query_annotation_from_clause = f"""
 5962                                    FROM {parquet_file_link} as table_parquet
 5963                                """
 5964                                sql_query_annotation_where_clause = f"""
 5965                                    table_variants."#CHROM" = '{chrom}'
 5966                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
 5967                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
 5968                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
 5969                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
 5970                                """
 5971
 5972                            # Create update query
 5973                            sql_query_annotation_chrom_interval_pos = f"""
 5974                                UPDATE {table_variants} as table_variants
 5975                                    SET INFO = 
 5976                                        concat(
 5977                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5978                                                THEN table_variants.INFO
 5979                                                ELSE ''
 5980                                            END
 5981                                            ,
 5982                                            CASE WHEN table_variants.INFO NOT IN ('','.')
 5983                                                        AND (
 5984                                                        concat({sql_query_annotation_update_info_sets_sql})
 5985                                                        )
 5986                                                        NOT IN ('','.') 
 5987                                                    THEN ';'
 5988                                                    ELSE ''
 5989                                            END
 5990                                            ,
 5991                                            {sql_query_annotation_update_info_sets_sql}
 5992                                            )
 5993                                    {sql_query_annotation_from_clause}
 5994                                    WHERE {sql_query_annotation_where_clause}
 5995                                    ;
 5996                                """
 5997
 5998                            # Add update query to dict
 5999                            query_dict[
 6000                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
 6001                            ] = sql_query_annotation_chrom_interval_pos
 6002
 6003                        nb_of_query = len(query_dict)
 6004                        num_query = 0
 6005
 6006                        # SET max_expression_depth TO x
 6007                        self.conn.execute("SET max_expression_depth TO 10000")
 6008
 6009                        for query_name in query_dict:
 6010                            query = query_dict[query_name]
 6011                            num_query += 1
 6012                            log.info(
 6013                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
 6014                            )
 6015                            result = self.conn.execute(query)
 6016                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
 6017                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
 6018                            log.info(
 6019                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
 6020                            )
 6021
 6022                        log.info(
 6023                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
 6024                        )
 6025
 6026                    else:
 6027
 6028                        log.info(
 6029                            f"Annotation '{annotation_name}' - No Annotations available"
 6030                        )
 6031
 6032                    log.debug("Final header: " + str(vcf_reader.infos))
 6033
 6034        # Remove added columns
 6035        for added_column in added_columns:
 6036            self.drop_column(column=added_column)
 6037
 6038    def annotation_splice(self, threads: int = None) -> None:
 6039        """
 6040        This function annotate with snpEff
 6041
 6042        :param threads: The number of threads to use
 6043        :return: the value of the variable "return_value".
 6044        """
 6045
 6046        # DEBUG
 6047        log.debug("Start annotation with splice tools")
 6048
 6049        # Threads
 6050        if not threads:
 6051            threads = self.get_threads()
 6052        log.debug("Threads: " + str(threads))
 6053
 6054        # DEBUG
 6055        delete_tmp = True
 6056        if self.get_config().get("verbosity", "warning") in ["debug"]:
 6057            delete_tmp = False
 6058            log.debug("Delete tmp files/folders: " + str(delete_tmp))
 6059
 6060        # Config
 6061        config = self.get_config()
 6062        log.debug("Config: " + str(config))
 6063        splice_config = config.get("tools", {}).get("splice", {})
 6064        if not splice_config:
 6065            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
 6066        if not splice_config:
 6067            msg_err = "No Splice tool config"
 6068            log.error(msg_err)
 6069            raise ValueError(msg_err)
 6070        log.debug(f"splice_config={splice_config}")
 6071
 6072        # Config - Folders - Databases
 6073        databases_folders = (
 6074            config.get("folders", {}).get("databases", {}).get("splice", ["."])
 6075        )
 6076        log.debug("Databases annotations: " + str(databases_folders))
 6077
 6078        # Splice docker image
 6079        splice_docker_image = splice_config.get("docker").get("image")
 6080
 6081        # Pull splice image if it's not already there
 6082        if not check_docker_image_exists(splice_docker_image):
 6083            log.warning(
 6084                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
 6085            )
 6086            try:
 6087                command(f"docker pull {splice_config.get('docker').get('image')}")
 6088            except subprocess.CalledProcessError:
 6089                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
 6090                log.error(msg_err)
 6091                raise ValueError(msg_err)
 6092                return None
 6093
 6094        # Config - splice databases
 6095        splice_databases = (
 6096            config.get("folders", {})
 6097            .get("databases", {})
 6098            .get("splice", DEFAULT_SPLICE_FOLDER)
 6099        )
 6100        splice_databases = full_path(splice_databases)
 6101
 6102        # Param
 6103        param = self.get_param()
 6104        log.debug("Param: " + str(param))
 6105
 6106        # Param
 6107        options = param.get("annotation", {}).get("splice", {})
 6108        log.debug("Options: " + str(options))
 6109
 6110        # Data
 6111        table_variants = self.get_table_variants()
 6112
 6113        # Check if not empty
 6114        log.debug("Check if not empty")
 6115        sql_query_chromosomes = (
 6116            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
 6117        )
 6118        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
 6119            log.info("VCF empty")
 6120            return None
 6121
 6122        # Export in VCF
 6123        log.debug("Create initial file to annotate")
 6124
 6125        # Create output folder
 6126        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
 6127        if not os.path.exists(output_folder):
 6128            Path(output_folder).mkdir(parents=True, exist_ok=True)
 6129
 6130        # Create tmp VCF file
 6131        tmp_vcf = NamedTemporaryFile(
 6132            prefix=self.get_prefix(),
 6133            dir=output_folder,
 6134            suffix=".vcf",
 6135            delete=False,
 6136        )
 6137        tmp_vcf_name = tmp_vcf.name
 6138
 6139        # VCF header
 6140        header = self.get_header()
 6141
 6142        # Existing annotations
 6143        for vcf_annotation in self.get_header().infos:
 6144
 6145            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
 6146            log.debug(
 6147                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
 6148            )
 6149
 6150        # Memory limit
 6151        if config.get("memory", None):
 6152            memory_limit = config.get("memory", "8G").upper()
 6153            # upper()
 6154        else:
 6155            memory_limit = "8G"
 6156        log.debug(f"memory_limit: {memory_limit}")
 6157
 6158        # Check number of variants to annotate
 6159        where_clause_regex_spliceai = r"SpliceAI_\w+"
 6160        where_clause_regex_spip = r"SPiP_\w+"
 6161        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
 6162        df_list_of_variants_to_annotate = self.get_query_to_df(
 6163            query=f""" SELECT * FROM variants {where_clause} """
 6164        )
 6165        if len(df_list_of_variants_to_annotate) == 0:
 6166            log.warning(
 6167                f"No variants to annotate with splice. Variants probably already annotated with splice"
 6168            )
 6169            return None
 6170        else:
 6171            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
 6172
 6173        # Export VCF file
 6174        self.export_variant_vcf(
 6175            vcf_file=tmp_vcf_name,
 6176            remove_info=True,
 6177            add_samples=True,
 6178            index=False,
 6179            where_clause=where_clause,
 6180        )
 6181
 6182        # Create docker container and launch splice analysis
 6183        if splice_config:
 6184
 6185            # Splice mount folders
 6186            mount_folders = splice_config.get("mount", {})
 6187
 6188            # Genome mount
 6189            mount_folders[
 6190                config.get("folders", {})
 6191                .get("databases", {})
 6192                .get("genomes", DEFAULT_GENOME_FOLDER)
 6193            ] = "ro"
 6194
 6195            # SpliceAI mount
 6196            mount_folders[
 6197                config.get("folders", {})
 6198                .get("databases", {})
 6199                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
 6200            ] = "ro"
 6201
 6202            # Genome mount
 6203            mount_folders[
 6204                config.get("folders", {})
 6205                .get("databases", {})
 6206                .get("spip", DEFAULT_SPIP_FOLDER)
 6207            ] = "ro"
 6208
 6209            # Mount folders
 6210            mount = []
 6211
 6212            # Config mount
 6213            mount = [
 6214                f"-v {full_path(path)}:{full_path(path)}:{mode}"
 6215                for path, mode in mount_folders.items()
 6216            ]
 6217
 6218            if any(value for value in splice_config.values() if value is None):
 6219                log.warning("At least one splice config parameter is empty")
 6220                return None
 6221
 6222            # Params in splice nf
 6223            def check_values(dico: dict):
 6224                """
 6225                Ensure parameters for NF splice pipeline
 6226                """
 6227                for key, val in dico.items():
 6228                    if key == "genome":
 6229                        if any(
 6230                            assemb in options.get("genome", {})
 6231                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
 6232                        ):
 6233                            yield f"--{key} hg19"
 6234                        elif any(
 6235                            assemb in options.get("genome", {})
 6236                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
 6237                        ):
 6238                            yield f"--{key} hg38"
 6239                    elif (
 6240                        (isinstance(val, str) and val)
 6241                        or isinstance(val, int)
 6242                        or isinstance(val, bool)
 6243                    ):
 6244                        yield f"--{key} {val}"
 6245
 6246            # Genome
 6247            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
 6248            options["genome"] = genome
 6249
 6250            # NF params
 6251            nf_params = []
 6252
 6253            # Add options
 6254            if options:
 6255                nf_params = list(check_values(options))
 6256                log.debug(f"Splice NF params: {' '.join(nf_params)}")
 6257            else:
 6258                log.debug("No NF params provided")
 6259
 6260            # Add threads
 6261            if "threads" not in options.keys():
 6262                nf_params.append(f"--threads {threads}")
 6263
 6264            # Genome path
 6265            genome_path = find_genome(
 6266                config.get("folders", {})
 6267                .get("databases", {})
 6268                .get("genomes", DEFAULT_GENOME_FOLDER),
 6269                file=f"{genome}.fa",
 6270            )
 6271            # Add genome path
 6272            if not genome_path:
 6273                raise ValueError(
 6274                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
 6275                )
 6276            else:
 6277                log.debug(f"Genome: {genome_path}")
 6278                nf_params.append(f"--genome_path {genome_path}")
 6279
 6280            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
 6281                """
 6282                Setting up updated databases for SPiP and SpliceAI
 6283                """
 6284
 6285                try:
 6286
 6287                    # SpliceAI assembly transcriptome
 6288                    spliceai_assembly = os.path.join(
 6289                        config.get("folders", {})
 6290                        .get("databases", {})
 6291                        .get("spliceai", {}),
 6292                        options.get("genome"),
 6293                        "transcriptome",
 6294                    )
 6295                    spip_assembly = options.get("genome")
 6296
 6297                    spip = find(
 6298                        f"transcriptome_{spip_assembly}.RData",
 6299                        config.get("folders", {}).get("databases", {}).get("spip", {}),
 6300                    )
 6301                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
 6302                    log.debug(f"SPiP annotations: {spip}")
 6303                    log.debug(f"SpliceAI annotations: {spliceai}")
 6304                    if spip and spliceai:
 6305                        return [
 6306                            f"--spip_transcriptome {spip}",
 6307                            f"--spliceai_annotations {spliceai}",
 6308                        ]
 6309                    else:
 6310                        # TODO crash and go on with basic annotations ?
 6311                        # raise ValueError(
 6312                        #     "Can't find splice databases in configuration EXIT"
 6313                        # )
 6314                        log.warning(
 6315                            "Can't find splice databases in configuration, use annotations file from image"
 6316                        )
 6317                except TypeError:
 6318                    log.warning(
 6319                        "Can't find splice databases in configuration, use annotations file from image"
 6320                    )
 6321                    return []
 6322
 6323            # Add options, check if transcriptome option have already beend provided
 6324            if (
 6325                "spip_transcriptome" not in nf_params
 6326                and "spliceai_transcriptome" not in nf_params
 6327            ):
 6328                splice_reference = splice_annotations(options, config)
 6329                if splice_reference:
 6330                    nf_params.extend(splice_reference)
 6331
 6332            nf_params.append(f"--output_folder {output_folder}")
 6333
 6334            random_uuid = f"HOWARD-SPLICE-{get_random()}"
 6335            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
 6336            log.debug(cmd)
 6337
 6338            splice_config["docker"]["command"] = cmd
 6339
 6340            docker_cmd = get_bin_command(
 6341                tool="splice",
 6342                bin_type="docker",
 6343                config=config,
 6344                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
 6345                add_options=f"--name {random_uuid} {' '.join(mount)}",
 6346            )
 6347
 6348            # Docker debug
 6349            # if splice_config.get("rm_container"):
 6350            #     rm_container = "--rm"
 6351            # else:
 6352            #     rm_container = ""
 6353            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
 6354
 6355            log.debug(docker_cmd)
 6356            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
 6357            log.debug(res.stdout)
 6358            if res.stderr:
 6359                log.error(res.stderr)
 6360            res.check_returncode()
 6361        else:
 6362            log.warning(f"Splice tool configuration not found: {config}")
 6363
 6364        # Update variants
 6365        log.info("Annotation - Updating...")
 6366        # Test find output vcf
 6367        log.debug(
 6368            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6369        )
 6370        output_vcf = []
 6371        # Wrong folder to look in
 6372        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
 6373            if (
 6374                files
 6375                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
 6376            ):
 6377                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
 6378        # log.debug(os.listdir(options.get("output_folder")))
 6379        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
 6380        if not output_vcf:
 6381            log.debug(
 6382                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
 6383            )
 6384        else:
 6385            # Get new header from annotated vcf
 6386            log.debug(f"Initial header: {len(header.infos)} fields")
 6387            # Create new header with splice infos
 6388            new_vcf = Variants(input=output_vcf[0])
 6389            new_vcf_header = new_vcf.get_header().infos
 6390            for keys, infos in new_vcf_header.items():
 6391                if keys not in header.infos.keys():
 6392                    header.infos[keys] = infos
 6393            log.debug(f"New header: {len(header.infos)} fields")
 6394            log.debug(f"Splice tmp output: {output_vcf[0]}")
 6395            self.update_from_vcf(output_vcf[0])
 6396
 6397        # Remove folder
 6398        remove_if_exists(output_folder)
 6399
 6400    ###
 6401    # Prioritization
 6402    ###
 6403
 6404    def get_config_default(self, name: str) -> dict:
 6405        """
 6406        The function `get_config_default` returns a dictionary containing default configurations for
 6407        various calculations and prioritizations.
 6408
 6409        :param name: The `get_config_default` function returns a dictionary containing default
 6410        configurations for different calculations and prioritizations. The `name` parameter is used to
 6411        specify which specific configuration to retrieve from the dictionary
 6412        :type name: str
 6413        :return: The function `get_config_default` returns a dictionary containing default configuration
 6414        settings for different calculations and prioritizations. The specific configuration settings are
 6415        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
 6416        matches a key in the `config_default` dictionary, the corresponding configuration settings are
 6417        returned. If there is no match, an empty dictionary is returned.
 6418        """
 6419
 6420        config_default = {
 6421            "calculations": {
 6422                "variant_chr_pos_alt_ref": {
 6423                    "type": "sql",
 6424                    "name": "variant_chr_pos_alt_ref",
 6425                    "description": "Create a variant ID with chromosome, position, alt and ref",
 6426                    "available": False,
 6427                    "output_column_name": "variant_chr_pos_alt_ref",
 6428                    "output_column_type": "String",
 6429                    "output_column_description": "variant ID with chromosome, position, alt and ref",
 6430                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
 6431                    "operation_info": True,
 6432                },
 6433                "VARTYPE": {
 6434                    "type": "sql",
 6435                    "name": "VARTYPE",
 6436                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
 6437                    "available": True,
 6438                    "output_column_name": "VARTYPE",
 6439                    "output_column_type": "String",
 6440                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
 6441                    "operation_query": """
 6442                            CASE
 6443                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
 6444                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
 6445                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
 6446                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
 6447                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
 6448                                ELSE 'UNDEFINED'
 6449                            END
 6450                            """,
 6451                    "info_fields": ["SVTYPE"],
 6452                    "operation_info": True,
 6453                },
 6454                "snpeff_hgvs": {
 6455                    "type": "python",
 6456                    "name": "snpeff_hgvs",
 6457                    "description": "HGVS nomenclatures from snpEff annotation",
 6458                    "available": True,
 6459                    "function_name": "calculation_extract_snpeff_hgvs",
 6460                    "function_params": ["snpeff_hgvs", "ANN"],
 6461                },
 6462                "snpeff_ann_explode": {
 6463                    "type": "python",
 6464                    "name": "snpeff_ann_explode",
 6465                    "description": "Explode snpEff annotations with uniquify values",
 6466                    "available": True,
 6467                    "function_name": "calculation_snpeff_ann_explode",
 6468                    "function_params": [False, "fields", "snpeff_", "ANN"],
 6469                },
 6470                "snpeff_ann_explode_uniquify": {
 6471                    "type": "python",
 6472                    "name": "snpeff_ann_explode_uniquify",
 6473                    "description": "Explode snpEff annotations",
 6474                    "available": True,
 6475                    "function_name": "calculation_snpeff_ann_explode",
 6476                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
 6477                },
 6478                "snpeff_ann_explode_json": {
 6479                    "type": "python",
 6480                    "name": "snpeff_ann_explode_json",
 6481                    "description": "Explode snpEff annotations in JSON format",
 6482                    "available": True,
 6483                    "function_name": "calculation_snpeff_ann_explode",
 6484                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
 6485                },
 6486                "NOMEN": {
 6487                    "type": "python",
 6488                    "name": "NOMEN",
 6489                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
 6490                    "available": True,
 6491                    "function_name": "calculation_extract_nomen",
 6492                    "function_params": [],
 6493                },
 6494                "FINDBYPIPELINE": {
 6495                    "type": "python",
 6496                    "name": "FINDBYPIPELINE",
 6497                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
 6498                    "available": True,
 6499                    "function_name": "calculation_find_by_pipeline",
 6500                    "function_params": ["findbypipeline"],
 6501                },
 6502                "FINDBYSAMPLE": {
 6503                    "type": "python",
 6504                    "name": "FINDBYSAMPLE",
 6505                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
 6506                    "available": True,
 6507                    "function_name": "calculation_find_by_pipeline",
 6508                    "function_params": ["findbysample"],
 6509                },
 6510                "GENOTYPECONCORDANCE": {
 6511                    "type": "python",
 6512                    "name": "GENOTYPECONCORDANCE",
 6513                    "description": "Concordance of genotype for multi caller VCF",
 6514                    "available": True,
 6515                    "function_name": "calculation_genotype_concordance",
 6516                    "function_params": [],
 6517                },
 6518                "BARCODE": {
 6519                    "type": "python",
 6520                    "name": "BARCODE",
 6521                    "description": "BARCODE as VaRank tool",
 6522                    "available": True,
 6523                    "function_name": "calculation_barcode",
 6524                    "function_params": [],
 6525                },
 6526                "BARCODEFAMILY": {
 6527                    "type": "python",
 6528                    "name": "BARCODEFAMILY",
 6529                    "description": "BARCODEFAMILY as VaRank tool",
 6530                    "available": True,
 6531                    "function_name": "calculation_barcode_family",
 6532                    "function_params": ["BCF"],
 6533                },
 6534                "TRIO": {
 6535                    "type": "python",
 6536                    "name": "TRIO",
 6537                    "description": "Inheritance for a trio family",
 6538                    "available": True,
 6539                    "function_name": "calculation_trio",
 6540                    "function_params": [],
 6541                },
 6542                "VAF": {
 6543                    "type": "python",
 6544                    "name": "VAF",
 6545                    "description": "Variant Allele Frequency (VAF) harmonization",
 6546                    "available": True,
 6547                    "function_name": "calculation_vaf_normalization",
 6548                    "function_params": [],
 6549                },
 6550                "VAF_stats": {
 6551                    "type": "python",
 6552                    "name": "VAF_stats",
 6553                    "description": "Variant Allele Frequency (VAF) statistics",
 6554                    "available": True,
 6555                    "function_name": "calculation_genotype_stats",
 6556                    "function_params": ["VAF"],
 6557                },
 6558                "DP_stats": {
 6559                    "type": "python",
 6560                    "name": "DP_stats",
 6561                    "description": "Depth (DP) statistics",
 6562                    "available": True,
 6563                    "function_name": "calculation_genotype_stats",
 6564                    "function_params": ["DP"],
 6565                },
 6566                "variant_id": {
 6567                    "type": "python",
 6568                    "name": "variant_id",
 6569                    "description": "Variant ID generated from variant position and type",
 6570                    "available": True,
 6571                    "function_name": "calculation_variant_id",
 6572                    "function_params": [],
 6573                },
 6574                "transcripts_json": {
 6575                    "type": "python",
 6576                    "name": "transcripts_json",
 6577                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
 6578                    "available": True,
 6579                    "function_name": "calculation_transcripts_annotation",
 6580                    "function_params": ["transcripts_json", None],
 6581                },
 6582                "transcripts_ann": {
 6583                    "type": "python",
 6584                    "name": "transcripts_ann",
 6585                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
 6586                    "available": True,
 6587                    "function_name": "calculation_transcripts_annotation",
 6588                    "function_params": [None, "transcripts_ann"],
 6589                },
 6590                "transcripts_annotations": {
 6591                    "type": "python",
 6592                    "name": "transcripts_annotations",
 6593                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
 6594                    "available": True,
 6595                    "function_name": "calculation_transcripts_annotation",
 6596                    "function_params": [None, None],
 6597                },
 6598                "transcripts_prioritization": {
 6599                    "type": "python",
 6600                    "name": "transcripts_prioritization",
 6601                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
 6602                    "available": True,
 6603                    "function_name": "calculation_transcripts_prioritization",
 6604                    "function_params": [],
 6605                },
 6606            },
 6607            "prioritizations": {
 6608                "default": {
 6609                    "ANN2": [
 6610                        {
 6611                            "type": "contains",
 6612                            "value": "HIGH",
 6613                            "score": 5,
 6614                            "flag": "PASS",
 6615                            "comment": [
 6616                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
 6617                            ],
 6618                        },
 6619                        {
 6620                            "type": "contains",
 6621                            "value": "MODERATE",
 6622                            "score": 3,
 6623                            "flag": "PASS",
 6624                            "comment": [
 6625                                "A non-disruptive variant that might change protein effectiveness"
 6626                            ],
 6627                        },
 6628                        {
 6629                            "type": "contains",
 6630                            "value": "LOW",
 6631                            "score": 0,
 6632                            "flag": "FILTERED",
 6633                            "comment": [
 6634                                "Assumed to be mostly harmless or unlikely to change protein behavior"
 6635                            ],
 6636                        },
 6637                        {
 6638                            "type": "contains",
 6639                            "value": "MODIFIER",
 6640                            "score": 0,
 6641                            "flag": "FILTERED",
 6642                            "comment": [
 6643                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
 6644                            ],
 6645                        },
 6646                    ],
 6647                }
 6648            },
 6649        }
 6650
 6651        return config_default.get(name, None)
 6652
 6653    def get_config_json(
 6654        self, name: str, config_dict: dict = {}, config_file: str = None
 6655    ) -> dict:
 6656        """
 6657        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
 6658        default values, a dictionary, and a file.
 6659
 6660        :param name: The `name` parameter in the `get_config_json` function is a string that represents
 6661        the name of the configuration. It is used to identify and retrieve the configuration settings
 6662        for a specific component or module
 6663        :type name: str
 6664        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
 6665        dictionary that allows you to provide additional configuration settings or overrides. When you
 6666        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
 6667        the key is the configuration setting you want to override or
 6668        :type config_dict: dict
 6669        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
 6670        specify the path to a configuration file that contains additional settings. If provided, the
 6671        function will read the contents of this file and update the configuration dictionary with the
 6672        values found in the file, overriding any existing values with the
 6673        :type config_file: str
 6674        :return: The function `get_config_json` returns a dictionary containing the configuration
 6675        settings.
 6676        """
 6677
 6678        # Create with default prioritizations
 6679        config_default = self.get_config_default(name=name)
 6680        configuration = config_default
 6681        # log.debug(f"configuration={configuration}")
 6682
 6683        # Replace prioritizations from dict
 6684        for config in config_dict:
 6685            configuration[config] = config_dict[config]
 6686
 6687        # Replace prioritizations from file
 6688        config_file = full_path(config_file)
 6689        if config_file:
 6690            if os.path.exists(config_file):
 6691                with open(config_file) as config_file_content:
 6692                    config_file_dict = json.load(config_file_content)
 6693                for config in config_file_dict:
 6694                    configuration[config] = config_file_dict[config]
 6695            else:
 6696                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
 6697                log.error(msg_error)
 6698                raise ValueError(msg_error)
 6699
 6700        return configuration
 6701
 6702    def prioritization(
 6703        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
 6704    ) -> bool:
 6705        """
 6706        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
 6707        prioritizes variants based on configured profiles and criteria.
 6708
 6709        :param table: The `table` parameter in the `prioritization` function is used to specify the name
 6710        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
 6711        a table name is provided, the method will prioritize the variants in that specific table
 6712        :type table: str
 6713        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
 6714        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
 6715        provided, the code will use a default prefix value of "PZ"
 6716        :type pz_prefix: str
 6717        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
 6718        additional parameters specific to the prioritization process. These parameters can include
 6719        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
 6720        configurations needed for the prioritization of variants in a V
 6721        :type pz_param: dict
 6722        :return: A boolean value (True) is being returned from the `prioritization` function.
 6723        """
 6724
 6725        # Config
 6726        config = self.get_config()
 6727
 6728        # Param
 6729        param = self.get_param()
 6730
 6731        # Prioritization param
 6732        if pz_param is not None:
 6733            prioritization_param = pz_param
 6734        else:
 6735            prioritization_param = param.get("prioritization", {})
 6736
 6737        # Configuration profiles
 6738        prioritization_config_file = prioritization_param.get(
 6739            "prioritization_config", None
 6740        )
 6741        prioritization_config_file = full_path(prioritization_config_file)
 6742        prioritizations_config = self.get_config_json(
 6743            name="prioritizations", config_file=prioritization_config_file
 6744        )
 6745
 6746        # Prioritization prefix
 6747        pz_prefix_default = "PZ"
 6748        if pz_prefix is None:
 6749            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
 6750
 6751        # Prioritization options
 6752        profiles = prioritization_param.get("profiles", [])
 6753        if isinstance(profiles, str):
 6754            profiles = profiles.split(",")
 6755        pzfields = prioritization_param.get(
 6756            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
 6757        )
 6758        if isinstance(pzfields, str):
 6759            pzfields = pzfields.split(",")
 6760        default_profile = prioritization_param.get("default_profile", None)
 6761        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
 6762        prioritization_score_mode = prioritization_param.get(
 6763            "prioritization_score_mode", "HOWARD"
 6764        )
 6765
 6766        # Quick Prioritizations
 6767        prioritizations = param.get("prioritizations", None)
 6768        if prioritizations:
 6769            log.info("Quick Prioritization:")
 6770            for profile in prioritizations.split(","):
 6771                if profile not in profiles:
 6772                    profiles.append(profile)
 6773                    log.info(f"   {profile}")
 6774
 6775        # If profile "ALL" provided, all profiles in the config profiles
 6776        if "ALL" in profiles:
 6777            profiles = list(prioritizations_config.keys())
 6778
 6779        for profile in profiles:
 6780            if prioritizations_config.get(profile, None):
 6781                log.debug(f"Profile '{profile}' configured")
 6782            else:
 6783                msg_error = f"Profile '{profile}' NOT configured"
 6784                log.error(msg_error)
 6785                raise ValueError(msg_error)
 6786
 6787        if profiles:
 6788            log.info(f"Prioritization... ")
 6789        else:
 6790            log.debug(f"No profile defined")
 6791            return False
 6792
 6793        if not default_profile and len(profiles):
 6794            default_profile = profiles[0]
 6795
 6796        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
 6797        log.debug("Profiles to check: " + str(list(profiles)))
 6798
 6799        # Variables
 6800        if table is not None:
 6801            table_variants = table
 6802        else:
 6803            table_variants = self.get_table_variants(clause="update")
 6804        log.debug(f"Table to prioritize: {table_variants}")
 6805
 6806        # Added columns
 6807        added_columns = []
 6808
 6809        # Create list of PZfields
 6810        # List of PZFields
 6811        list_of_pzfields_original = pzfields + [
 6812            pzfield + pzfields_sep + profile
 6813            for pzfield in pzfields
 6814            for profile in profiles
 6815        ]
 6816        list_of_pzfields = []
 6817        log.debug(f"{list_of_pzfields_original}")
 6818
 6819        # Remove existing PZfields to use if exists
 6820        for pzfield in list_of_pzfields_original:
 6821            if self.get_header().infos.get(pzfield, None) is None:
 6822                list_of_pzfields.append(pzfield)
 6823                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
 6824            else:
 6825                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
 6826
 6827        if list_of_pzfields:
 6828
 6829            # Explode Infos prefix
 6830            explode_infos_prefix = self.get_explode_infos_prefix()
 6831
 6832            # PZfields tags description
 6833            PZfields_INFOS = {
 6834                f"{pz_prefix}Tags": {
 6835                    "ID": f"{pz_prefix}Tags",
 6836                    "Number": ".",
 6837                    "Type": "String",
 6838                    "Description": "Variant tags based on annotation criteria",
 6839                },
 6840                f"{pz_prefix}Score": {
 6841                    "ID": f"{pz_prefix}Score",
 6842                    "Number": 1,
 6843                    "Type": "Integer",
 6844                    "Description": "Variant score based on annotation criteria",
 6845                },
 6846                f"{pz_prefix}Flag": {
 6847                    "ID": f"{pz_prefix}Flag",
 6848                    "Number": 1,
 6849                    "Type": "String",
 6850                    "Description": "Variant flag based on annotation criteria",
 6851                },
 6852                f"{pz_prefix}Comment": {
 6853                    "ID": f"{pz_prefix}Comment",
 6854                    "Number": ".",
 6855                    "Type": "String",
 6856                    "Description": "Variant comment based on annotation criteria",
 6857                },
 6858                f"{pz_prefix}Infos": {
 6859                    "ID": f"{pz_prefix}Infos",
 6860                    "Number": ".",
 6861                    "Type": "String",
 6862                    "Description": "Variant infos based on annotation criteria",
 6863                },
 6864                f"{pz_prefix}Class": {
 6865                    "ID": f"{pz_prefix}Class",
 6866                    "Number": ".",
 6867                    "Type": "String",
 6868                    "Description": "Variant class based on annotation criteria",
 6869                },
 6870            }
 6871
 6872            # Create INFO fields if not exist
 6873            for field in PZfields_INFOS:
 6874                field_ID = PZfields_INFOS[field]["ID"]
 6875                field_description = PZfields_INFOS[field]["Description"]
 6876                if field_ID not in self.get_header().infos and field_ID in pzfields:
 6877                    field_description = (
 6878                        PZfields_INFOS[field]["Description"]
 6879                        + f", profile {default_profile}"
 6880                    )
 6881                    self.get_header().infos[field_ID] = vcf.parser._Info(
 6882                        field_ID,
 6883                        PZfields_INFOS[field]["Number"],
 6884                        PZfields_INFOS[field]["Type"],
 6885                        field_description,
 6886                        "unknown",
 6887                        "unknown",
 6888                        code_type_map[PZfields_INFOS[field]["Type"]],
 6889                    )
 6890
 6891            # Create INFO fields if not exist for each profile
 6892            for profile in prioritizations_config:
 6893                if profile in profiles or profiles == []:
 6894                    for field in PZfields_INFOS:
 6895                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
 6896                        field_description = (
 6897                            PZfields_INFOS[field]["Description"]
 6898                            + f", profile {profile}"
 6899                        )
 6900                        if (
 6901                            field_ID not in self.get_header().infos
 6902                            and field in pzfields
 6903                        ):
 6904                            self.get_header().infos[field_ID] = vcf.parser._Info(
 6905                                field_ID,
 6906                                PZfields_INFOS[field]["Number"],
 6907                                PZfields_INFOS[field]["Type"],
 6908                                field_description,
 6909                                "unknown",
 6910                                "unknown",
 6911                                code_type_map[PZfields_INFOS[field]["Type"]],
 6912                            )
 6913
 6914            # Header
 6915            for pzfield in list_of_pzfields:
 6916                if re.match(f"{pz_prefix}Score.*", pzfield):
 6917                    added_column = self.add_column(
 6918                        table_name=table_variants,
 6919                        column_name=pzfield,
 6920                        column_type="INTEGER",
 6921                        default_value="0",
 6922                    )
 6923                elif re.match(f"{pz_prefix}Flag.*", pzfield):
 6924                    added_column = self.add_column(
 6925                        table_name=table_variants,
 6926                        column_name=pzfield,
 6927                        column_type="BOOLEAN",
 6928                        default_value="1",
 6929                    )
 6930                elif re.match(f"{pz_prefix}Class.*", pzfield):
 6931                    added_column = self.add_column(
 6932                        table_name=table_variants,
 6933                        column_name=pzfield,
 6934                        column_type="VARCHAR[]",
 6935                        default_value="null",
 6936                    )
 6937                else:
 6938                    added_column = self.add_column(
 6939                        table_name=table_variants,
 6940                        column_name=pzfield,
 6941                        column_type="STRING",
 6942                        default_value="''",
 6943                    )
 6944                added_columns.append(added_column)
 6945
 6946            # Profiles
 6947            if profiles:
 6948
 6949                # foreach profile in configuration file
 6950                for profile in prioritizations_config:
 6951
 6952                    # If profile is asked in param, or ALL are asked (empty profile [])
 6953                    if profile in profiles or profiles == []:
 6954                        log.info(f"Profile '{profile}'")
 6955
 6956                        sql_set_info_option = ""
 6957
 6958                        sql_set_info = []
 6959
 6960                        # PZ fields set
 6961
 6962                        # PZScore
 6963                        if (
 6964                            f"{pz_prefix}Score{pzfields_sep}{profile}"
 6965                            in list_of_pzfields
 6966                        ):
 6967                            sql_set_info.append(
 6968                                f"""
 6969                                    concat(
 6970                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
 6971                                        {pz_prefix}Score{pzfields_sep}{profile}
 6972                                    ) 
 6973                                """
 6974                            )
 6975                            if (
 6976                                profile == default_profile
 6977                                and f"{pz_prefix}Score" in list_of_pzfields
 6978                            ):
 6979                                sql_set_info.append(
 6980                                    f"""
 6981                                        concat(
 6982                                            '{pz_prefix}Score=',
 6983                                            {pz_prefix}Score{pzfields_sep}{profile}
 6984                                        )
 6985                                    """
 6986                                )
 6987
 6988                        # PZFlag
 6989                        if (
 6990                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
 6991                            in list_of_pzfields
 6992                        ):
 6993                            sql_set_info.append(
 6994                                f"""
 6995                                    concat(
 6996                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
 6997                                        CASE 
 6998                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 6999                                            THEN 'PASS'
 7000                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7001                                            THEN 'FILTERED'
 7002                                        END
 7003                                    ) 
 7004                                """
 7005                            )
 7006                            if (
 7007                                profile == default_profile
 7008                                and f"{pz_prefix}Flag" in list_of_pzfields
 7009                            ):
 7010                                sql_set_info.append(
 7011                                    f"""
 7012                                        concat(
 7013                                            '{pz_prefix}Flag=',
 7014                                            CASE 
 7015                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
 7016                                                THEN 'PASS'
 7017                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
 7018                                                THEN 'FILTERED'
 7019                                            END
 7020                                        )
 7021                                    """
 7022                                )
 7023
 7024                        # PZClass
 7025                        if (
 7026                            f"{pz_prefix}Class{pzfields_sep}{profile}"
 7027                            in list_of_pzfields
 7028                        ):
 7029                            sql_set_info.append(
 7030                                f"""
 7031                                    concat(
 7032                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
 7033                                        CASE
 7034                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7035                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7036                                            ELSE '.'
 7037                                        END 
 7038                                    )
 7039                                    
 7040                                """
 7041                            )
 7042                            if (
 7043                                profile == default_profile
 7044                                and f"{pz_prefix}Class" in list_of_pzfields
 7045                            ):
 7046                                sql_set_info.append(
 7047                                    f"""
 7048                                        concat(
 7049                                            '{pz_prefix}Class=',
 7050                                            CASE
 7051                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7052                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7053                                                ELSE '.'
 7054                                            END 
 7055                                        )
 7056                                    """
 7057                                )
 7058
 7059                        # PZComment
 7060                        if (
 7061                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7062                            in list_of_pzfields
 7063                        ):
 7064                            sql_set_info.append(
 7065                                f"""
 7066                                    CASE
 7067                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7068                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
 7069                                        ELSE ''
 7070                                    END
 7071                                """
 7072                            )
 7073                            if (
 7074                                profile == default_profile
 7075                                and f"{pz_prefix}Comment" in list_of_pzfields
 7076                            ):
 7077                                sql_set_info.append(
 7078                                    f"""
 7079                                        CASE
 7080                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
 7081                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
 7082                                            ELSE ''
 7083                                        END
 7084                                    """
 7085                                )
 7086
 7087                        # PZInfos
 7088                        if (
 7089                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7090                            in list_of_pzfields
 7091                        ):
 7092                            sql_set_info.append(
 7093                                f"""
 7094                                    CASE
 7095                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7096                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
 7097                                        ELSE ''
 7098                                    END
 7099                                """
 7100                            )
 7101                            if (
 7102                                profile == default_profile
 7103                                and f"{pz_prefix}Infos" in list_of_pzfields
 7104                            ):
 7105                                sql_set_info.append(
 7106                                    f"""
 7107                                        CASE
 7108                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
 7109                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
 7110                                            ELSE ''
 7111                                        END
 7112                                    """
 7113                                )
 7114
 7115                        # Merge PZfields
 7116                        sql_set_info_option = ""
 7117                        sql_set_sep = ""
 7118                        for sql_set in sql_set_info:
 7119                            if sql_set_sep:
 7120                                sql_set_info_option += f"""
 7121                                    , concat('{sql_set_sep}', {sql_set})
 7122                                """
 7123                            else:
 7124                                sql_set_info_option += f"""
 7125                                    , {sql_set}
 7126                                """
 7127                            sql_set_sep = ";"
 7128
 7129                        sql_queries = []
 7130                        for annotation in prioritizations_config[profile]:
 7131
 7132                            # skip special sections
 7133                            if annotation.startswith("_"):
 7134                                continue
 7135
 7136                            # For each criterions
 7137                            for criterion in prioritizations_config[profile][
 7138                                annotation
 7139                            ]:
 7140
 7141                                # Criterion mode
 7142                                criterion_mode = None
 7143                                if np.any(
 7144                                    np.isin(list(criterion.keys()), ["type", "value"])
 7145                                ):
 7146                                    criterion_mode = "operation"
 7147                                elif np.any(
 7148                                    np.isin(list(criterion.keys()), ["sql", "fields"])
 7149                                ):
 7150                                    criterion_mode = "sql"
 7151                                log.debug(f"Criterion Mode: {criterion_mode}")
 7152
 7153                                # Criterion parameters
 7154                                criterion_type = criterion.get("type", None)
 7155                                criterion_value = criterion.get("value", None)
 7156                                criterion_sql = criterion.get("sql", None)
 7157                                criterion_fields = criterion.get("fields", None)
 7158                                criterion_score = criterion.get("score", 0)
 7159                                criterion_flag = criterion.get("flag", "PASS")
 7160                                criterion_class = criterion.get("class", None)
 7161                                criterion_flag_bool = criterion_flag == "PASS"
 7162                                criterion_comment = (
 7163                                    ", ".join(criterion.get("comment", []))
 7164                                    .replace("'", "''")
 7165                                    .replace(";", ",")
 7166                                    .replace("\t", " ")
 7167                                )
 7168                                criterion_infos = (
 7169                                    str(criterion)
 7170                                    .replace("'", "''")
 7171                                    .replace(";", ",")
 7172                                    .replace("\t", " ")
 7173                                )
 7174
 7175                                # SQL
 7176                                if criterion_sql is not None and isinstance(
 7177                                    criterion_sql, list
 7178                                ):
 7179                                    criterion_sql = " ".join(criterion_sql)
 7180
 7181                                # Fields and explode
 7182                                if criterion_fields is None:
 7183                                    criterion_fields = [annotation]
 7184                                if not isinstance(criterion_fields, list):
 7185                                    criterion_fields = str(criterion_fields).split(",")
 7186
 7187                                # Class
 7188                                if criterion_class is not None and not isinstance(
 7189                                    criterion_class, list
 7190                                ):
 7191                                    criterion_class = str(criterion_class).split(",")
 7192
 7193                                for annotation_field in criterion_fields:
 7194
 7195                                    # Explode specific annotation
 7196                                    log.debug(
 7197                                        f"Explode annotation '{annotation_field}'"
 7198                                    )
 7199                                    added_columns += self.explode_infos(
 7200                                        prefix=explode_infos_prefix,
 7201                                        fields=[annotation_field],
 7202                                        table=table_variants,
 7203                                    )
 7204                                    extra_infos = self.get_extra_infos(
 7205                                        table=table_variants
 7206                                    )
 7207
 7208                                    # Check if annotation field is present
 7209                                    if (
 7210                                        f"{explode_infos_prefix}{annotation_field}"
 7211                                        not in extra_infos
 7212                                    ):
 7213                                        msq_err = f"Annotation '{annotation_field}' not in data"
 7214                                        log.error(msq_err)
 7215                                        raise ValueError(msq_err)
 7216                                    else:
 7217                                        log.debug(
 7218                                            f"Annotation '{annotation_field}' in data"
 7219                                        )
 7220
 7221                                sql_set = []
 7222                                sql_set_info = []
 7223
 7224                                # PZ fields set
 7225
 7226                                # PZScore
 7227                                if (
 7228                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
 7229                                    in list_of_pzfields
 7230                                ):
 7231                                    # if prioritization_score_mode == "HOWARD":
 7232                                    #     sql_set.append(
 7233                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7234                                    #     )
 7235                                    # VaRank prioritization score mode
 7236                                    if prioritization_score_mode == "VaRank":
 7237                                        sql_set.append(
 7238                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
 7239                                        )
 7240                                    # default HOWARD prioritization score mode
 7241                                    else:
 7242                                        sql_set.append(
 7243                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
 7244                                        )
 7245
 7246                                # PZFlag
 7247                                if (
 7248                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
 7249                                    in list_of_pzfields
 7250                                ):
 7251                                    sql_set.append(
 7252                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
 7253                                    )
 7254
 7255                                # PZClass
 7256                                if (
 7257                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
 7258                                    in list_of_pzfields
 7259                                    and criterion_class is not None
 7260                                ):
 7261                                    sql_set.append(
 7262                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
 7263                                    )
 7264
 7265                                # PZComment
 7266                                if (
 7267                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
 7268                                    in list_of_pzfields
 7269                                ):
 7270                                    sql_set.append(
 7271                                        f"""
 7272                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
 7273                                                concat(
 7274                                                    {pz_prefix}Comment{pzfields_sep}{profile},
 7275                                                    CASE 
 7276                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
 7277                                                        THEN ', '
 7278                                                        ELSE ''
 7279                                                    END,
 7280                                                    '{criterion_comment}'
 7281                                                )
 7282                                        """
 7283                                    )
 7284
 7285                                # PZInfos
 7286                                if (
 7287                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
 7288                                    in list_of_pzfields
 7289                                ):
 7290                                    sql_set.append(
 7291                                        f"""
 7292                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
 7293                                                concat(
 7294                                                    {pz_prefix}Infos{pzfields_sep}{profile},
 7295                                                    '{criterion_infos}'
 7296                                                )
 7297                                        """
 7298                                    )
 7299                                sql_set_option = ",".join(sql_set)
 7300
 7301                                # Criterion and comparison
 7302                                if sql_set_option:
 7303
 7304                                    if criterion_mode in ["operation"]:
 7305
 7306                                        try:
 7307                                            float(criterion_value)
 7308                                            sql_update = f"""
 7309                                                UPDATE {table_variants}
 7310                                                SET {sql_set_option}
 7311                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
 7312                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
 7313                                            """
 7314                                        except:
 7315                                            contains_option = ""
 7316                                            if criterion_type == "contains":
 7317                                                contains_option = ".*"
 7318                                            sql_update = f"""
 7319                                                UPDATE {table_variants}
 7320                                                SET {sql_set_option}
 7321                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
 7322                                            """
 7323                                        sql_queries.append(sql_update)
 7324
 7325                                    elif criterion_mode in ["sql"]:
 7326
 7327                                        sql_update = f"""
 7328                                            UPDATE {table_variants}
 7329                                            SET {sql_set_option}
 7330                                            WHERE {criterion_sql}
 7331                                        """
 7332                                        sql_queries.append(sql_update)
 7333
 7334                                    else:
 7335                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
 7336                                        log.error(msg_err)
 7337                                        raise ValueError(msg_err)
 7338
 7339                                else:
 7340                                    log.warning(
 7341                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
 7342                                    )
 7343
 7344                        # PZTags
 7345                        if (
 7346                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
 7347                            in list_of_pzfields
 7348                        ):
 7349
                            # Create PZTags value
 7351                            pztags_value = ""
 7352                            pztags_sep_default = ","
 7353                            pztags_sep = ""
 7354                            for pzfield in pzfields:
 7355                                if pzfield not in [f"{pz_prefix}Tags"]:
 7356                                    if (
 7357                                        f"{pzfield}{pzfields_sep}{profile}"
 7358                                        in list_of_pzfields
 7359                                    ):
 7360                                        if pzfield in [f"{pz_prefix}Flag"]:
 7361                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7362                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
 7363                                                    THEN 'PASS'
 7364                                                    ELSE 'FILTERED'
 7365                                                END, '"""
 7366                                        elif pzfield in [f"{pz_prefix}Class"]:
 7367                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
 7368                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
 7369                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
 7370                                                    ELSE '.'
 7371                                                END, '"""
 7372                                        else:
 7373                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
 7374                                        pztags_sep = pztags_sep_default
 7375
                            # Add Query update for PZTags
 7377                            sql_update_pztags = f"""
 7378                                UPDATE {table_variants}
 7379                                SET INFO = concat(
 7380                                        INFO,
 7381                                        CASE WHEN INFO NOT in ('','.')
 7382                                                THEN ';'
 7383                                                ELSE ''
 7384                                        END,
 7385                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
 7386                                    )
 7387                                """
 7388                            sql_queries.append(sql_update_pztags)
 7389
                            # Add Query update for PZTags for the default profile
 7391                            if profile == default_profile:
 7392                                sql_update_pztags_default = f"""
 7393                                UPDATE {table_variants}
 7394                                SET INFO = concat(
 7395                                        INFO,
 7396                                        ';',
 7397                                        '{pz_prefix}Tags={pztags_value}'
 7398                                    )
 7399                                """
 7400                                sql_queries.append(sql_update_pztags_default)
 7401
 7402                        log.info(f"""Profile '{profile}' - Prioritization... """)
 7403
 7404                        if sql_queries:
 7405
 7406                            for sql_query in sql_queries:
 7407                                log.debug(
 7408                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
 7409                                )
 7410                                self.conn.execute(sql_query)
 7411
 7412                        log.info(f"""Profile '{profile}' - Update... """)
 7413                        sql_query_update = f"""
 7414                            UPDATE {table_variants}
 7415                            SET INFO =  
 7416                                concat(
 7417                                    CASE
 7418                                        WHEN INFO NOT IN ('','.')
 7419                                        THEN concat(INFO, ';')
 7420                                        ELSE ''
 7421                                    END
 7422                                    {sql_set_info_option}
 7423                                )
 7424                        """
 7425                        self.conn.execute(sql_query_update)
 7426
 7427        else:
 7428
 7429            log.warning(f"No profiles in parameters")
 7430
 7431        # Remove added columns
 7432        for added_column in added_columns:
 7433            self.drop_column(column=added_column)
 7434
 7435        # Explode INFOS fields into table fields
 7436        if self.get_explode_infos():
 7437            self.explode_infos(
 7438                prefix=self.get_explode_infos_prefix(),
 7439                fields=self.get_explode_infos_fields(),
 7440                force=True,
 7441            )
 7442
 7443        return True
 7444
 7445    ###
 7446    # HGVS
 7447    ###
 7448
 7449    def annotation_hgvs(self, threads: int = None) -> None:
 7450        """
 7451        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
 7452        coordinates and alleles.
 7453
 7454        :param threads: The `threads` parameter is an optional integer that specifies the number of
 7455        threads to use for parallel processing. If no value is provided, it will default to the number
 7456        of threads obtained from the `get_threads()` method
 7457        :type threads: int
 7458        """
 7459
 7460        # Function for each partition of the Dask Dataframe
 7461        def partition_function(partition):
 7462            """
 7463            The function `partition_function` applies the `annotation_hgvs_partition` function to
 7464            each row of a DataFrame called `partition`.
 7465
 7466            :param partition: The parameter "partition" is a pandas DataFrame that contains the data
 7467            to be processed
 7468            :return: the result of applying the "annotation_hgvs_partition" function to each row of
 7469            the "partition" dataframe along the axis 1.
 7470            """
 7471            return partition.apply(annotation_hgvs_partition, axis=1)
 7472
        def annotation_hgvs_partition(row) -> str:
            """
            Build the comma-separated HGVS annotation string for one variant.

            Looks up the transcripts overlapping the variant position in the
            in-memory `refseq_df` (through the enclosing `polars_conn` SQL
            context), then formats one HGVS name per transcript with
            `format_hgvs_name`, honouring the HGVS options captured from the
            enclosing scope (`use_exon`, `use_gene`, `use_protein`,
            `add_protein`, `full_format`, `use_version`, `codon_type`).

            :param row: a row of the variants dataframe exposing the keys
                "CHROM", "POS", "REF" and "ALT" (presumably a pandas Series
                produced by `DataFrame.apply(axis=1)` — TODO confirm)
            :return: the HGVS names for the row joined with ',' (empty string
                when no transcript overlaps the position)
            """

            # NOTE(review): `chr` shadows the builtin of the same name;
            # harmless here but worth renaming outside this review.
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find the transcripts overlapping this position.
            # Values are interpolated straight into the SQL text; they come
            # from the variants table itself, not external input.
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript model matching this transcript name
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon number is only resolved when `use_exon` is enabled
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession, resolved through `refseqlink_df` whenever
                # any protein-aware output form is requested
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name with the configured options
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When `add_protein` is set and the protein form is not
                # already included, append a second protein-level HGVS name
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create list of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full
 7568
 7569        # Polars connexion
 7570        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7571
 7572        # Config
 7573        config = self.get_config()
 7574
 7575        # Databases
 7576        # Genome
 7577        databases_genomes_folders = (
 7578            config.get("folders", {})
 7579            .get("databases", {})
 7580            .get("genomes", DEFAULT_GENOME_FOLDER)
 7581        )
 7582        databases_genome = (
 7583            config.get("folders", {}).get("databases", {}).get("genomes", "")
 7584        )
 7585        # refseq database folder
 7586        databases_refseq_folders = (
 7587            config.get("folders", {})
 7588            .get("databases", {})
 7589            .get("refseq", DEFAULT_REFSEQ_FOLDER)
 7590        )
 7591        # refseq
 7592        databases_refseq = config.get("databases", {}).get("refSeq", None)
 7593        # refSeqLink
 7594        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)
 7595
 7596        # Param
 7597        param = self.get_param()
 7598
 7599        # Quick HGVS
 7600        if "hgvs_options" in param and param.get("hgvs_options", ""):
 7601            log.info(f"Quick HGVS Annotation:")
 7602            if not param.get("hgvs", None):
 7603                param["hgvs"] = {}
 7604            for option in param.get("hgvs_options", "").split(","):
 7605                option_var_val = option.split("=")
 7606                option_var = option_var_val[0]
 7607                if len(option_var_val) > 1:
 7608                    option_val = option_var_val[1]
 7609                else:
 7610                    option_val = "True"
 7611                if option_val.upper() in ["TRUE"]:
 7612                    option_val = True
 7613                elif option_val.upper() in ["FALSE"]:
 7614                    option_val = False
 7615                log.info(f"   {option_var}={option_val}")
 7616                param["hgvs"][option_var] = option_val
 7617
 7618        # Check if HGVS annotation enabled
 7619        if "hgvs" in param:
 7620            log.info(f"HGVS Annotation... ")
 7621            for hgvs_option in param.get("hgvs", {}):
 7622                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
 7623        else:
 7624            return
 7625
 7626        # HGVS Param
 7627        param_hgvs = param.get("hgvs", {})
 7628        use_exon = param_hgvs.get("use_exon", False)
 7629        use_gene = param_hgvs.get("use_gene", False)
 7630        use_protein = param_hgvs.get("use_protein", False)
 7631        add_protein = param_hgvs.get("add_protein", False)
 7632        full_format = param_hgvs.get("full_format", False)
 7633        use_version = param_hgvs.get("use_version", False)
 7634        codon_type = param_hgvs.get("codon_type", "3")
 7635
        # refSeq refSeqLink
 7637        databases_refseq = param_hgvs.get("refseq", databases_refseq)
 7638        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)
 7639
 7640        # Assembly
 7641        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
 7642
 7643        # Genome
 7644        genome_file = None
 7645        if find_genome(databases_genome):
 7646            genome_file = find_genome(databases_genome)
 7647        else:
 7648            genome_file = find_genome(
 7649                genome_path=databases_genomes_folders, assembly=assembly
 7650            )
 7651        log.debug("Genome: " + str(genome_file))
 7652
        # refSeq
 7654        refseq_file = find_file_prefix(
 7655            input_file=databases_refseq,
 7656            prefix="ncbiRefSeq",
 7657            folder=databases_refseq_folders,
 7658            assembly=assembly,
 7659        )
 7660        log.debug("refSeq: " + str(refseq_file))
 7661
 7662        # refSeqLink
 7663        refseqlink_file = find_file_prefix(
 7664            input_file=databases_refseqlink,
 7665            prefix="ncbiRefSeqLink",
 7666            folder=databases_refseq_folders,
 7667            assembly=assembly,
 7668        )
 7669        log.debug("refSeqLink: " + str(refseqlink_file))
 7670
 7671        # Threads
 7672        if not threads:
 7673            threads = self.get_threads()
 7674        log.debug("Threads: " + str(threads))
 7675
 7676        # Variables
 7677        table_variants = self.get_table_variants(clause="update")
 7678
 7679        # Get variants SNV and InDel only
 7680        query_variants = f"""
 7681            SELECT "#CHROM" AS CHROM, POS, REF, ALT
 7682            FROM {table_variants}
 7683            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
 7684            """
 7685        df_variants = self.get_query_to_df(query_variants)
 7686
 7687        # Added columns
 7688        added_columns = []
 7689
 7690        # Add hgvs column in variants table
 7691        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
 7692        added_column = self.add_column(
 7693            table_variants, hgvs_column_name, "STRING", default_value=None
 7694        )
 7695        added_columns.append(added_column)
 7696
 7697        log.debug(f"refSeq loading...")
 7698        # refSeq in duckDB
 7699        refseq_table = get_refseq_table(
 7700            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
 7701        )
 7702        # Loading all refSeq in Dataframe
 7703        refseq_query = f"""
 7704            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
 7705            FROM {refseq_table}
 7706            JOIN df_variants ON (
 7707                {refseq_table}.chrom = df_variants.CHROM
 7708                AND {refseq_table}.txStart<=df_variants.POS
 7709                AND {refseq_table}.txEnd>=df_variants.POS
 7710            )
 7711        """
 7712        refseq_df = self.conn.query(refseq_query).pl()
 7713
 7714        if refseqlink_file:
 7715            log.debug(f"refSeqLink loading...")
 7716            # refSeqLink in duckDB
 7717            refseqlink_table = get_refseq_table(
 7718                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
 7719            )
 7720            # Loading all refSeqLink in Dataframe
 7721            protacc_column = "protAcc_with_ver"
 7722            mrnaacc_column = "mrnaAcc_with_ver"
 7723            refseqlink_query = f"""
 7724                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
 7725                FROM {refseqlink_table} 
 7726                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
 7727                WHERE protAcc_without_ver IS NOT NULL
 7728            """
 7729            # Polars Dataframe
 7730            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()
 7731
 7732        # Read RefSeq transcripts into a python dict/model.
 7733        log.debug(f"Transcripts loading...")
 7734        with tempfile.TemporaryDirectory() as tmpdir:
 7735            transcripts_query = f"""
 7736                COPY (
 7737                    SELECT {refseq_table}.*
 7738                    FROM {refseq_table}
 7739                    JOIN df_variants ON (
 7740                        {refseq_table}.chrom=df_variants.CHROM
 7741                        AND {refseq_table}.txStart<=df_variants.POS
 7742                        AND {refseq_table}.txEnd>=df_variants.POS
 7743                    )
 7744                )
 7745                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
 7746            """
 7747            self.conn.query(transcripts_query)
 7748            with open(f"{tmpdir}/transcript.tsv") as infile:
 7749                transcripts = read_transcripts(infile)
 7750
 7751        # Polars connexion
 7752        polars_conn = pl.SQLContext(register_globals=True, eager=True)
 7753
 7754        log.debug("Genome loading...")
 7755        # Read genome sequence using pyfaidx.
 7756        genome = Fasta(genome_file)
 7757
 7758        log.debug("Start annotation HGVS...")
 7759
        # Create a Dask DataFrame from the Pandas dataframe,
        # with one partition per thread
 7762        ddf = dd.from_pandas(df_variants, npartitions=threads)
 7763
 7764        # Use dask.dataframe.apply() to apply function on each partition
 7765        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)
 7766
 7767        # Convert Dask DataFrame to Pandas Dataframe
 7768        df = ddf.compute()
 7769
 7770        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
 7771        with tempfile.TemporaryDirectory() as tmpdir:
 7772            df_parquet = os.path.join(tmpdir, "df.parquet")
 7773            df.to_parquet(df_parquet)
 7774
 7775            # Update hgvs column
 7776            update_variant_query = f"""
 7777                UPDATE {table_variants}
 7778                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
 7779                FROM read_parquet('{df_parquet}') as df
 7780                WHERE variants."#CHROM" = df.CHROM
 7781                AND variants.POS = df.POS
 7782                AND variants.REF = df.REF
 7783                AND variants.ALT = df.ALT
 7784                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
 7785                """
 7786            self.execute_query(update_variant_query)
 7787
 7788        # Update INFO column
 7789        sql_query_update = f"""
 7790            UPDATE {table_variants}
 7791            SET INFO = 
 7792                concat(
 7793                    CASE 
 7794                        WHEN INFO NOT IN ('','.')
 7795                        THEN concat(INFO, ';')
 7796                        ELSE ''
 7797                    END,
 7798                    'hgvs=',
 7799                    {hgvs_column_name}
 7800                )
 7801            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
 7802            """
 7803        self.execute_query(sql_query_update)
 7804
 7805        # Add header
 7806        HGVS_INFOS = {
 7807            "hgvs": {
 7808                "ID": "hgvs",
 7809                "Number": ".",
 7810                "Type": "String",
 7811                "Description": f"HGVS annotatation with HOWARD",
 7812            }
 7813        }
 7814
 7815        for field in HGVS_INFOS:
 7816            field_ID = HGVS_INFOS[field]["ID"]
 7817            field_description = HGVS_INFOS[field]["Description"]
 7818            self.get_header().infos[field_ID] = vcf.parser._Info(
 7819                field_ID,
 7820                HGVS_INFOS[field]["Number"],
 7821                HGVS_INFOS[field]["Type"],
 7822                field_description,
 7823                "unknown",
 7824                "unknown",
 7825                code_type_map[HGVS_INFOS[field]["Type"]],
 7826            )
 7827
 7828        # Remove added columns
 7829        for added_column in added_columns:
 7830            self.drop_column(column=added_column)
 7831
 7832    ###
 7833    # Calculation
 7834    ###
 7835
 7836    def get_operations_help(
 7837        self, operations_config_dict: dict = {}, operations_config_file: str = None
 7838    ) -> list:
 7839
 7840        # Init
 7841        operations_help = []
 7842
 7843        # operations
 7844        operations = self.get_config_json(
 7845            name="calculations",
 7846            config_dict=operations_config_dict,
 7847            config_file=operations_config_file,
 7848        )
 7849        for op in operations:
 7850            op_name = operations[op].get("name", op).upper()
 7851            op_description = operations[op].get("description", op_name)
 7852            op_available = operations[op].get("available", False)
 7853            if op_available:
 7854                operations_help.append(f"   {op_name}: {op_description}")
 7855
 7856        # Sort operations
 7857        operations_help.sort()
 7858
 7859        # insert header
 7860        operations_help.insert(0, "Available calculation operations:")
 7861
 7862        # Return
 7863        return operations_help
 7864
 7865    def calculation(
 7866        self,
 7867        operations: dict = {},
 7868        operations_config_dict: dict = {},
 7869        operations_config_file: str = None,
 7870    ) -> None:
 7871        """
 7872        It takes a list of operations, and for each operation, it checks if it's a python or sql
 7873        operation, and then calls the appropriate function
 7874
 7875        param json example:
 7876            "calculation": {
 7877                "NOMEN": {
 7878                    "options": {
 7879                        "hgvs_field": "hgvs"
 7880                    },
 7881                "middle" : null
 7882            }
 7883        """
 7884
 7885        # Param
 7886        param = self.get_param()
 7887
 7888        # operations config
 7889        operations_config = self.get_config_json(
 7890            name="calculations",
 7891            config_dict=operations_config_dict,
 7892            config_file=operations_config_file,
 7893        )
 7894
 7895        # Upper keys
 7896        operations_config = {k.upper(): v for k, v in operations_config.items()}
 7897
 7898        # Calculations
 7899
 7900        # Operations from param
 7901        operations = param.get("calculation", {}).get("calculations", operations)
 7902
 7903        # Quick calculation - add
 7904        if param.get("calculations", None):
 7905            calculations_list = [
 7906                value for value in param.get("calculations", "").split(",")
 7907            ]
 7908            log.info(f"Quick Calculations:")
 7909            for calculation_key in calculations_list:
 7910                log.info(f"   {calculation_key}")
 7911            for calculation_operation in calculations_list:
 7912                if calculation_operation.upper() not in operations:
 7913                    operations[calculation_operation.upper()] = {}
 7914                    add_value_into_dict(
 7915                        dict_tree=param,
 7916                        sections=[
 7917                            "calculation",
 7918                            "calculations",
 7919                            calculation_operation.upper(),
 7920                        ],
 7921                        value={},
 7922                    )
 7923
 7924        # Operations for calculation
 7925        if not operations:
 7926            operations = param.get("calculation", {}).get("calculations", {})
 7927
 7928        if operations:
 7929            log.info(f"Calculations...")
 7930
 7931        # For each operations
 7932        for operation_name in operations:
 7933            operation_name = operation_name.upper()
 7934            if operation_name not in [""]:
 7935                if operation_name in operations_config:
 7936                    log.info(f"Calculation '{operation_name}'")
 7937                    operation = operations_config[operation_name]
 7938                    operation_type = operation.get("type", "sql")
 7939                    if operation_type == "python":
 7940                        self.calculation_process_function(
 7941                            operation=operation, operation_name=operation_name
 7942                        )
 7943                    elif operation_type == "sql":
 7944                        self.calculation_process_sql(
 7945                            operation=operation, operation_name=operation_name
 7946                        )
 7947                    else:
 7948                        log.error(
 7949                            f"Operations config: Type '{operation_type}' NOT available"
 7950                        )
 7951                        raise ValueError(
 7952                            f"Operations config: Type '{operation_type}' NOT available"
 7953                        )
 7954                else:
 7955                    log.error(
 7956                        f"Operations config: Calculation '{operation_name}' NOT available"
 7957                    )
 7958                    raise ValueError(
 7959                        f"Operations config: Calculation '{operation_name}' NOT available"
 7960                    )
 7961
 7962        # Explode INFOS fields into table fields
 7963        if self.get_explode_infos():
 7964            self.explode_infos(
 7965                prefix=self.get_explode_infos_prefix(),
 7966                fields=self.get_explode_infos_fields(),
 7967                force=True,
 7968            )
 7969
 7970    def calculation_process_sql(
 7971        self, operation: dict, operation_name: str = "unknown"
 7972    ) -> None:
 7973        """
 7974        The `calculation_process_sql` function takes in a mathematical operation as a string and
 7975        performs the operation, updating the specified table with the result.
 7976
 7977        :param operation: The `operation` parameter is a dictionary that contains information about the
 7978        mathematical operation to be performed. It includes the following keys:
 7979        :type operation: dict
 7980        :param operation_name: The `operation_name` parameter is a string that represents the name of
 7981        the mathematical operation being performed. It is used for logging and error handling purposes,
 7982        defaults to unknown
 7983        :type operation_name: str (optional)
 7984        """
 7985
 7986        # table variants
 7987        table_variants = self.get_table_variants(clause="alter")
 7988
 7989        # Operation infos
 7990        operation_name = operation.get("name", "unknown")
 7991        log.debug(f"process sql {operation_name}")
 7992        output_column_name = operation.get("output_column_name", operation_name)
 7993        output_column_type = operation.get("output_column_type", "String")
 7994        prefix = operation.get("explode_infos_prefix", "")
 7995        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
 7996        output_column_description = operation.get(
 7997            "output_column_description", f"{operation_name} operation"
 7998        )
 7999        operation_query = operation.get("operation_query", None)
 8000        if isinstance(operation_query, list):
 8001            operation_query = " ".join(operation_query)
 8002        operation_info_fields = operation.get("info_fields", [])
 8003        operation_info_fields_check = operation.get("info_fields_check", False)
 8004        operation_info = operation.get("operation_info", True)
 8005
 8006        if operation_query:
 8007
 8008            # Info fields check
 8009            operation_info_fields_check_result = True
 8010            if operation_info_fields_check:
 8011                header_infos = self.get_header().infos
 8012                for info_field in operation_info_fields:
 8013                    operation_info_fields_check_result = (
 8014                        operation_info_fields_check_result
 8015                        and info_field in header_infos
 8016                    )
 8017
 8018            # If info fields available
 8019            if operation_info_fields_check_result:
 8020
 8021                # Added_columns
 8022                added_columns = []
 8023
 8024                # Create VCF header field
 8025                vcf_reader = self.get_header()
 8026                vcf_reader.infos[output_column_name] = vcf.parser._Info(
 8027                    output_column_name,
 8028                    ".",
 8029                    output_column_type,
 8030                    output_column_description,
 8031                    "howard calculation",
 8032                    "0",
 8033                    self.code_type_map.get(output_column_type),
 8034                )
 8035
 8036                # Explode infos if needed
 8037                log.debug(f"calculation_process_sql prefix {prefix}")
 8038                added_columns += self.explode_infos(
 8039                    prefix=prefix,
 8040                    fields=[output_column_name] + operation_info_fields,
 8041                    force=True,
 8042                )
 8043
 8044                # Create column
 8045                added_column = self.add_column(
 8046                    table_name=table_variants,
 8047                    column_name=prefix + output_column_name,
 8048                    column_type=output_column_type_sql,
 8049                    default_value="null",
 8050                )
 8051                added_columns.append(added_column)
 8052
 8053                # Operation calculation
 8054                try:
 8055
 8056                    # Query to update calculation column
 8057                    sql_update = f"""
 8058                        UPDATE {table_variants}
 8059                        SET "{prefix}{output_column_name}" = ({operation_query})
 8060                    """
 8061                    self.conn.execute(sql_update)
 8062
 8063                    # Add to INFO
 8064                    if operation_info:
 8065                        sql_update_info = f"""
 8066                            UPDATE {table_variants}
 8067                            SET "INFO" =
 8068                                concat(
 8069                                    CASE
 8070                                        WHEN "INFO" IS NOT NULL
 8071                                        THEN concat("INFO", ';')
 8072                                        ELSE ''
 8073                                    END,
 8074                                    '{output_column_name}=',
 8075                                    "{prefix}{output_column_name}"
 8076                                )
 8077                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
 8078                        """
 8079                        self.conn.execute(sql_update_info)
 8080
 8081                except:
 8082                    log.error(
 8083                        f"Operations config: Calculation '{operation_name}' query failed"
 8084                    )
 8085                    raise ValueError(
 8086                        f"Operations config: Calculation '{operation_name}' query failed"
 8087                    )
 8088
 8089                # Remove added columns
 8090                for added_column in added_columns:
 8091                    log.debug(f"added_column: {added_column}")
 8092                    self.drop_column(column=added_column)
 8093
 8094            else:
 8095                log.error(
 8096                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8097                )
 8098                raise ValueError(
 8099                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
 8100                )
 8101
 8102        else:
 8103            log.error(
 8104                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8105            )
 8106            raise ValueError(
 8107                f"Operations config: Calculation '{operation_name}' query NOT defined"
 8108            )
 8109
 8110    def calculation_process_function(
 8111        self, operation: dict, operation_name: str = "unknown"
 8112    ) -> None:
 8113        """
 8114        The `calculation_process_function` takes in an operation dictionary and performs the specified
 8115        function with the given parameters.
 8116
 8117        :param operation: The `operation` parameter is a dictionary that contains information about the
 8118        operation to be performed. It has the following keys:
 8119        :type operation: dict
 8120        :param operation_name: The `operation_name` parameter is a string that represents the name of
 8121        the operation being performed. It is used for logging purposes, defaults to unknown
 8122        :type operation_name: str (optional)
 8123        """
 8124
 8125        operation_name = operation["name"]
 8126        log.debug(f"process sql {operation_name}")
 8127        function_name = operation["function_name"]
 8128        function_params = operation["function_params"]
 8129        getattr(self, function_name)(*function_params)
 8130
 8131    def calculation_variant_id(self) -> None:
 8132        """
 8133        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
 8134        updates the INFO field of a variants table with the variant ID.
 8135        """
 8136
 8137        # variant_id annotation field
 8138        variant_id_tag = self.get_variant_id_column()
 8139        added_columns = [variant_id_tag]
 8140
 8141        # variant_id hgvs tags"
 8142        vcf_infos_tags = {
 8143            variant_id_tag: "howard variant ID annotation",
 8144        }
 8145
 8146        # Variants table
 8147        table_variants = self.get_table_variants()
 8148
 8149        # Header
 8150        vcf_reader = self.get_header()
 8151
 8152        # Add variant_id to header
 8153        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
 8154            variant_id_tag,
 8155            ".",
 8156            "String",
 8157            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
 8158            "howard calculation",
 8159            "0",
 8160            self.code_type_map.get("String"),
 8161        )
 8162
 8163        # Update
 8164        sql_update = f"""
 8165            UPDATE {table_variants}
 8166            SET "INFO" = 
 8167                concat(
 8168                    CASE
 8169                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8170                        THEN ''
 8171                        ELSE concat("INFO", ';')
 8172                    END,
 8173                    '{variant_id_tag}=',
 8174                    "{variant_id_tag}"
 8175                )
 8176        """
 8177        self.conn.execute(sql_update)
 8178
 8179        # Remove added columns
 8180        for added_column in added_columns:
 8181            self.drop_column(column=added_column)
 8182
 8183    def calculation_extract_snpeff_hgvs(
 8184        self,
 8185        snpeff_hgvs: str = "snpeff_hgvs",
 8186        snpeff_field: str = "ANN",
 8187    ) -> None:
 8188        """
 8189        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
 8190        annotation field in a VCF file and adds them as a new column in the variants table.
 8191
 8192        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
 8193        function is used to specify the name of the column that will store the HGVS nomenclatures
 8194        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
 8195        snpeff_hgvs
 8196        :type snpeff_hgvs: str (optional)
 8197        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
 8198        function represents the field in the VCF file that contains SnpEff annotations. This field is
 8199        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
 8200        to ANN
 8201        :type snpeff_field: str (optional)
 8202        """
 8203
 8204        # Snpeff hgvs tags
 8205        vcf_infos_tags = {
 8206            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
 8207        }
 8208
 8209        # Prefix
 8210        prefix = self.get_explode_infos_prefix()
 8211        if prefix:
 8212            prefix = "INFO/"
 8213
 8214        # snpEff fields
 8215        speff_ann_infos = prefix + snpeff_field
 8216        speff_hgvs_infos = prefix + snpeff_hgvs
 8217
 8218        # Variants table
 8219        table_variants = self.get_table_variants()
 8220
 8221        # Header
 8222        vcf_reader = self.get_header()
 8223
 8224        # Add columns
 8225        added_columns = []
 8226
 8227        # Explode HGVS field in column
 8228        added_columns += self.explode_infos(fields=[snpeff_field])
 8229
 8230        if snpeff_field in vcf_reader.infos:
 8231
 8232            log.debug(vcf_reader.infos[snpeff_field])
 8233
 8234            # Extract ANN header
 8235            ann_description = vcf_reader.infos[snpeff_field].desc
 8236            pattern = r"'(.+?)'"
 8237            match = re.search(pattern, ann_description)
 8238            if match:
 8239                ann_header_match = match.group(1).split(" | ")
 8240                ann_header_desc = {}
 8241                for i in range(len(ann_header_match)):
 8242                    ann_header_info = "".join(
 8243                        char for char in ann_header_match[i] if char.isalnum()
 8244                    )
 8245                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8246                if not ann_header_desc:
 8247                    raise ValueError("Invalid header description format")
 8248            else:
 8249                raise ValueError("Invalid header description format")
 8250
 8251            # Create variant id
 8252            variant_id_column = self.get_variant_id_column()
 8253            added_columns += [variant_id_column]
 8254
 8255            # Create dataframe
 8256            dataframe_snpeff_hgvs = self.get_query_to_df(
 8257                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8258            )
 8259
 8260            # Create main NOMEN column
 8261            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8262                speff_ann_infos
 8263            ].apply(
 8264                lambda x: extract_snpeff_hgvs(
 8265                    str(x), header=list(ann_header_desc.values())
 8266                )
 8267            )
 8268
 8269            # Add snpeff_hgvs to header
 8270            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
 8271                snpeff_hgvs,
 8272                ".",
 8273                "String",
 8274                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
 8275                "howard calculation",
 8276                "0",
 8277                self.code_type_map.get("String"),
 8278            )
 8279
 8280            # Update
 8281            sql_update = f"""
 8282                UPDATE variants
 8283                SET "INFO" = 
 8284                    concat(
 8285                        CASE
 8286                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8287                            THEN ''
 8288                            ELSE concat("INFO", ';')
 8289                        END,
 8290                        CASE 
 8291                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8292                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8293                            THEN concat(
 8294                                    '{snpeff_hgvs}=',
 8295                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8296                                )
 8297                            ELSE ''
 8298                        END
 8299                    )
 8300                FROM dataframe_snpeff_hgvs
 8301                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8302
 8303            """
 8304            self.conn.execute(sql_update)
 8305
 8306            # Delete dataframe
 8307            del dataframe_snpeff_hgvs
 8308            gc.collect()
 8309
 8310        else:
 8311
 8312            log.warning(
 8313                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8314            )
 8315
 8316        # Remove added columns
 8317        for added_column in added_columns:
 8318            self.drop_column(column=added_column)
 8319
 8320    def calculation_snpeff_ann_explode(
 8321        self,
 8322        uniquify: bool = True,
 8323        output_format: str = "fields",
 8324        output_prefix: str = "snpeff_",
 8325        snpeff_field: str = "ANN",
 8326    ) -> None:
 8327        """
 8328        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
 8329        exploding the HGVS field and updating variant information accordingly.
 8330
 8331        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
 8332        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
 8333        it indicates that the output should be unique, meaning that duplicate entries should be removed,
 8334        defaults to True
 8335        :type uniquify: bool (optional)
 8336        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
 8337        function specifies the format in which the output annotations will be generated. It has a
 8338        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
 8339        format, defaults to fields
 8340        :type output_format: str (optional)
 8341        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
 8342        method is used to specify the prefix that will be added to the output annotations generated
 8343        during the calculation process. This prefix helps to differentiate the newly added annotations
 8344        from existing ones in the output data. By default, the, defaults to ANN_
 8345        :type output_prefix: str (optional)
 8346        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
 8347        function is used to specify the field in the VCF file that contains SnpEff annotations. This
 8348        field will be processed to explode the HGVS annotations and update the variant information
 8349        accordingly, defaults to ANN
 8350        :type snpeff_field: str (optional)
 8351        """
 8352
 8353        # SnpEff annotation field
 8354        snpeff_hgvs = "snpeff_ann_explode"
 8355
 8356        # Snpeff hgvs tags
 8357        vcf_infos_tags = {
 8358            snpeff_hgvs: "Explode snpEff annotations",
 8359        }
 8360
 8361        # Prefix
 8362        prefix = self.get_explode_infos_prefix()
 8363        if prefix:
 8364            prefix = "INFO/"
 8365
 8366        # snpEff fields
 8367        speff_ann_infos = prefix + snpeff_field
 8368        speff_hgvs_infos = prefix + snpeff_hgvs
 8369
 8370        # Variants table
 8371        table_variants = self.get_table_variants()
 8372
 8373        # Header
 8374        vcf_reader = self.get_header()
 8375
 8376        # Add columns
 8377        added_columns = []
 8378
 8379        # Explode HGVS field in column
 8380        added_columns += self.explode_infos(fields=[snpeff_field])
 8381        log.debug(f"snpeff_field={snpeff_field}")
 8382        log.debug(f"added_columns={added_columns}")
 8383
 8384        if snpeff_field in vcf_reader.infos:
 8385
 8386            # Extract ANN header
 8387            ann_description = vcf_reader.infos[snpeff_field].desc
 8388            pattern = r"'(.+?)'"
 8389            match = re.search(pattern, ann_description)
 8390            if match:
 8391                ann_header_match = match.group(1).split(" | ")
 8392                ann_header = []
 8393                ann_header_desc = {}
 8394                for i in range(len(ann_header_match)):
 8395                    ann_header_info = "".join(
 8396                        char for char in ann_header_match[i] if char.isalnum()
 8397                    )
 8398                    ann_header.append(ann_header_info)
 8399                    ann_header_desc[ann_header_info] = ann_header_match[i]
 8400                if not ann_header_desc:
 8401                    raise ValueError("Invalid header description format")
 8402            else:
 8403                raise ValueError("Invalid header description format")
 8404
 8405            # Create variant id
 8406            variant_id_column = self.get_variant_id_column()
 8407            added_columns += [variant_id_column]
 8408
 8409            # Create dataframe
 8410            dataframe_snpeff_hgvs = self.get_query_to_df(
 8411                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
 8412            )
 8413
 8414            # Create snpEff columns
 8415            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
 8416                speff_ann_infos
 8417            ].apply(
 8418                lambda x: explode_snpeff_ann(
 8419                    str(x),
 8420                    uniquify=uniquify,
 8421                    output_format=output_format,
 8422                    prefix=output_prefix,
 8423                    header=list(ann_header_desc.values()),
 8424                )
 8425            )
 8426
 8427            # Header
 8428            ann_annotations_prefix = ""
 8429            if output_format.upper() in ["JSON"]:
 8430                ann_annotations_prefix = f"{output_prefix}="
 8431                vcf_reader.infos[output_prefix] = vcf.parser._Info(
 8432                    output_prefix,
 8433                    ".",
 8434                    "String",
 8435                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8436                    + " - JSON format",
 8437                    "howard calculation",
 8438                    "0",
 8439                    self.code_type_map.get("String"),
 8440                )
 8441            else:
 8442                for ann_annotation in ann_header:
 8443                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
 8444                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
 8445                        ann_annotation_id,
 8446                        ".",
 8447                        "String",
 8448                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
 8449                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
 8450                        "howard calculation",
 8451                        "0",
 8452                        self.code_type_map.get("String"),
 8453                    )
 8454
 8455            # Update
 8456            sql_update = f"""
 8457                UPDATE variants
 8458                SET "INFO" = 
 8459                    concat(
 8460                        CASE
 8461                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8462                            THEN ''
 8463                            ELSE concat("INFO", ';')
 8464                        END,
 8465                        CASE 
 8466                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
 8467                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
 8468                            THEN concat(
 8469                                '{ann_annotations_prefix}',
 8470                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
 8471                                )
 8472                            ELSE ''
 8473                        END
 8474                    )
 8475                FROM dataframe_snpeff_hgvs
 8476                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
 8477
 8478            """
 8479            self.conn.execute(sql_update)
 8480
 8481            # Delete dataframe
 8482            del dataframe_snpeff_hgvs
 8483            gc.collect()
 8484
 8485        else:
 8486
 8487            log.warning(
 8488                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
 8489            )
 8490
 8491        # Remove added columns
 8492        for added_column in added_columns:
 8493            self.drop_column(column=added_column)
 8494
    def calculation_extract_nomen(self) -> None:
        """
        Extract the HGVS nomenclature (NOMEN) of each variant and append it to the INFO field.

        The configured HGVS annotation field (option
        param['calculation']['calculations']['NOMEN']['options']['hgvs_field'], default
        'hgvs') is exploded into a table column, parsed with `find_nomen` into a NOMEN
        structure (NOMEN, CNOMEN, RNOMEN, NNOMEN, PNOMEN, TVNOMEN, TNOMEN, VNOMEN,
        ENOMEN, GNOMEN), and each non-empty component is appended to the INFO column as
        ';<FIELD>=<value>'. An optional transcripts file (option 'transcripts') provides
        the ordered list of preferred transcripts passed to `find_nomen`.

        :raises ValueError: if the configured transcripts file does not exist
        """

        # Name of the temporary dataframe column holding the full NOMEN dict per variant
        field_nomen_dict = "NOMEN_DICT"

        # NOMEN structure: output INFO field names mapped to their VCF header descriptions
        nomen_dict = {
            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
        }

        # Param
        param = self.get_param()

        # Prefix used for exploded INFO columns
        prefix = self.get_explode_infos_prefix()

        # Header object, updated in place with the new INFO fields
        vcf_reader = self.get_header()

        # Get HGVS field name to parse (default 'hgvs')
        hgvs_field = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("hgvs_field", "hgvs")
        )

        # Get transcripts file path (optional; prioritizes preferred transcripts)
        transcripts_file = (
            param.get("calculation", {})
            .get("calculations", {})
            .get("NOMEN", {})
            .get("options", {})
            .get("transcripts", None)
        )
        transcripts_file = full_path(transcripts_file)
        transcripts = []
        if transcripts_file:
            if os.path.exists(transcripts_file):
                # First column of the transcripts file is the transcript list
                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
            else:
                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")

        # Columns added to the variants table, dropped again at the end
        added_columns = []

        # Explode HGVS field in column
        added_columns += self.explode_infos(fields=[hgvs_field])

        # Exploded columns available in the variants table
        extra_infos = self.get_extra_infos()
        extra_field = prefix + hgvs_field

        # Only proceed when the HGVS annotation actually exists as an exploded column
        if extra_field in extra_infos:

            # Create dataframe with the variant key columns and the HGVS annotation
            dataframe_hgvs = self.get_query_to_df(
                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
            )

            # Create main NOMEN column: one dict of NOMEN components per variant
            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
                lambda x: find_nomen(str(x), transcripts=transcripts)
            )

            # Explode NOMEN Structure and create SQL set for update
            sql_nomen_fields = []
            for nomen_field in nomen_dict:

                # Explode each field into a column
                # (the lambda is applied immediately within this iteration,
                # so binding nomen_field here is safe)
                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
                    lambda x: dict(x).get(nomen_field, "")
                )

                # Create VCF header field
                vcf_reader.infos[nomen_field] = vcf.parser._Info(
                    nomen_field,
                    ".",
                    "String",
                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )
                # One CASE expression per NOMEN component: emits ';FIELD=value'
                # only when the component is present and non-empty
                sql_nomen_fields.append(
                    f"""
                        CASE 
                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
                            THEN concat(
                                    ';{nomen_field}=',
                                    dataframe_hgvs."{nomen_field}"
                                )
                            ELSE ''
                        END
                    """
                )

            # SQL set for update
            sql_nomen_fields_set = ", ".join(sql_nomen_fields)

            # Update INFO by joining on the full variant key (#CHROM, POS, REF, ALT)
            sql_update = f"""
                UPDATE variants
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL
                            THEN ''
                            ELSE "INFO"
                        END,
                        {sql_nomen_fields_set}
                    )
                FROM dataframe_hgvs
                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
                    AND variants."POS" = dataframe_hgvs."POS" 
                    AND variants."REF" = dataframe_hgvs."REF"
                    AND variants."ALT" = dataframe_hgvs."ALT"
            """
            self.conn.execute(sql_update)

            # Delete dataframe to release memory
            del dataframe_hgvs
            gc.collect()

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)
 8637
 8638    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
 8639        """
 8640        The function `calculation_find_by_pipeline` performs a calculation to find the number of
 8641        pipeline/sample for a variant and updates the variant information in a VCF file.
 8642
 8643        :param tag: The `tag` parameter is a string that represents the annotation field for the
 8644        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
 8645        VCF header and to update the corresponding field in the variants table, defaults to
 8646        findbypipeline
 8647        :type tag: str (optional)
 8648        """
 8649
 8650        # if FORMAT and samples
 8651        if (
 8652            "FORMAT" in self.get_header_columns_as_list()
 8653            and self.get_header_sample_list()
 8654        ):
 8655
 8656            # findbypipeline annotation field
 8657            findbypipeline_tag = tag
 8658
 8659            # VCF infos tags
 8660            vcf_infos_tags = {
 8661                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
 8662            }
 8663
 8664            # Prefix
 8665            prefix = self.get_explode_infos_prefix()
 8666
 8667            # Field
 8668            findbypipeline_infos = prefix + findbypipeline_tag
 8669
 8670            # Variants table
 8671            table_variants = self.get_table_variants()
 8672
 8673            # Header
 8674            vcf_reader = self.get_header()
 8675
 8676            # Create variant id
 8677            variant_id_column = self.get_variant_id_column()
 8678            added_columns = [variant_id_column]
 8679
 8680            # variant_id, FORMAT and samples
 8681            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8682                self.get_header_sample_list()
 8683            )
 8684
 8685            # Create dataframe
 8686            dataframe_findbypipeline = self.get_query_to_df(
 8687                f""" SELECT {samples_fields} FROM {table_variants} """
 8688            )
 8689
 8690            # Create findbypipeline column
 8691            dataframe_findbypipeline[findbypipeline_infos] = (
 8692                dataframe_findbypipeline.apply(
 8693                    lambda row: findbypipeline(
 8694                        row, samples=self.get_header_sample_list()
 8695                    ),
 8696                    axis=1,
 8697                )
 8698            )
 8699
 8700            # Add snpeff_hgvs to header
 8701            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
 8702                findbypipeline_tag,
 8703                ".",
 8704                "String",
 8705                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
 8706                "howard calculation",
 8707                "0",
 8708                self.code_type_map.get("String"),
 8709            )
 8710
 8711            # Update
 8712            sql_update = f"""
 8713                UPDATE variants
 8714                SET "INFO" = 
 8715                    concat(
 8716                        CASE
 8717                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8718                            THEN ''
 8719                            ELSE concat("INFO", ';')
 8720                        END,
 8721                        CASE 
 8722                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
 8723                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
 8724                            THEN concat(
 8725                                    '{findbypipeline_tag}=',
 8726                                    dataframe_findbypipeline."{findbypipeline_infos}"
 8727                                )
 8728                            ELSE ''
 8729                        END
 8730                    )
 8731                FROM dataframe_findbypipeline
 8732                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
 8733            """
 8734            self.conn.execute(sql_update)
 8735
 8736            # Remove added columns
 8737            for added_column in added_columns:
 8738                self.drop_column(column=added_column)
 8739
 8740            # Delete dataframe
 8741            del dataframe_findbypipeline
 8742            gc.collect()
 8743
 8744    def calculation_genotype_concordance(self) -> None:
 8745        """
 8746        The function `calculation_genotype_concordance` calculates the genotype concordance for
 8747        multi-caller VCF files and updates the variant information in the database.
 8748        """
 8749
 8750        # if FORMAT and samples
 8751        if (
 8752            "FORMAT" in self.get_header_columns_as_list()
 8753            and self.get_header_sample_list()
 8754        ):
 8755
 8756            # genotypeconcordance annotation field
 8757            genotypeconcordance_tag = "genotypeconcordance"
 8758
 8759            # VCF infos tags
 8760            vcf_infos_tags = {
 8761                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
 8762            }
 8763
 8764            # Prefix
 8765            prefix = self.get_explode_infos_prefix()
 8766
 8767            # Field
 8768            genotypeconcordance_infos = prefix + genotypeconcordance_tag
 8769
 8770            # Variants table
 8771            table_variants = self.get_table_variants()
 8772
 8773            # Header
 8774            vcf_reader = self.get_header()
 8775
 8776            # Create variant id
 8777            variant_id_column = self.get_variant_id_column()
 8778            added_columns = [variant_id_column]
 8779
 8780            # variant_id, FORMAT and samples
 8781            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8782                self.get_header_sample_list()
 8783            )
 8784
 8785            # Create dataframe
 8786            dataframe_genotypeconcordance = self.get_query_to_df(
 8787                f""" SELECT {samples_fields} FROM {table_variants} """
 8788            )
 8789
 8790            # Create genotypeconcordance column
 8791            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
 8792                dataframe_genotypeconcordance.apply(
 8793                    lambda row: genotypeconcordance(
 8794                        row, samples=self.get_header_sample_list()
 8795                    ),
 8796                    axis=1,
 8797                )
 8798            )
 8799
 8800            # Add genotypeconcordance to header
 8801            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
 8802                genotypeconcordance_tag,
 8803                ".",
 8804                "String",
 8805                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
 8806                "howard calculation",
 8807                "0",
 8808                self.code_type_map.get("String"),
 8809            )
 8810
 8811            # Update
 8812            sql_update = f"""
 8813                UPDATE variants
 8814                SET "INFO" = 
 8815                    concat(
 8816                        CASE
 8817                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8818                            THEN ''
 8819                            ELSE concat("INFO", ';')
 8820                        END,
 8821                        CASE
 8822                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
 8823                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
 8824                            THEN concat(
 8825                                    '{genotypeconcordance_tag}=',
 8826                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
 8827                                )
 8828                            ELSE ''
 8829                        END
 8830                    )
 8831                FROM dataframe_genotypeconcordance
 8832                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
 8833            """
 8834            self.conn.execute(sql_update)
 8835
 8836            # Remove added columns
 8837            for added_column in added_columns:
 8838                self.drop_column(column=added_column)
 8839
 8840            # Delete dataframe
 8841            del dataframe_genotypeconcordance
 8842            gc.collect()
 8843
 8844    def calculation_barcode(self, tag: str = "barcode") -> None:
 8845        """
 8846        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
 8847        updates the INFO field in the file with the calculated barcode values.
 8848
 8849        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
 8850        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
 8851        the default tag name is set to "barcode", defaults to barcode
 8852        :type tag: str (optional)
 8853        """
 8854
 8855        # if FORMAT and samples
 8856        if (
 8857            "FORMAT" in self.get_header_columns_as_list()
 8858            and self.get_header_sample_list()
 8859        ):
 8860
 8861            # barcode annotation field
 8862            if not tag:
 8863                tag = "barcode"
 8864
 8865            # VCF infos tags
 8866            vcf_infos_tags = {
 8867                tag: "barcode calculation (VaRank)",
 8868            }
 8869
 8870            # Prefix
 8871            prefix = self.get_explode_infos_prefix()
 8872
 8873            # Field
 8874            barcode_infos = prefix + tag
 8875
 8876            # Variants table
 8877            table_variants = self.get_table_variants()
 8878
 8879            # Header
 8880            vcf_reader = self.get_header()
 8881
 8882            # Create variant id
 8883            variant_id_column = self.get_variant_id_column()
 8884            added_columns = [variant_id_column]
 8885
 8886            # variant_id, FORMAT and samples
 8887            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 8888                self.get_header_sample_list()
 8889            )
 8890
 8891            # Create dataframe
 8892            dataframe_barcode = self.get_query_to_df(
 8893                f""" SELECT {samples_fields} FROM {table_variants} """
 8894            )
 8895
 8896            # Create barcode column
 8897            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 8898                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
 8899            )
 8900
 8901            # Add barcode to header
 8902            vcf_reader.infos[tag] = vcf.parser._Info(
 8903                tag,
 8904                ".",
 8905                "String",
 8906                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
 8907                "howard calculation",
 8908                "0",
 8909                self.code_type_map.get("String"),
 8910            )
 8911
 8912            # Update
 8913            sql_update = f"""
 8914                UPDATE {table_variants}
 8915                SET "INFO" = 
 8916                    concat(
 8917                        CASE
 8918                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 8919                            THEN ''
 8920                            ELSE concat("INFO", ';')
 8921                        END,
 8922                        CASE
 8923                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
 8924                            AND dataframe_barcode."{barcode_infos}" NOT NULL
 8925                            THEN concat(
 8926                                    '{tag}=',
 8927                                    dataframe_barcode."{barcode_infos}"
 8928                                )
 8929                            ELSE ''
 8930                        END
 8931                    )
 8932                FROM dataframe_barcode
 8933                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 8934            """
 8935            self.conn.execute(sql_update)
 8936
 8937            # Remove added columns
 8938            for added_column in added_columns:
 8939                self.drop_column(column=added_column)
 8940
 8941            # Delete dataframe
 8942            del dataframe_barcode
 8943            gc.collect()
 8944
 8945    def calculation_barcode_family(self, tag: str = "BCF") -> None:
 8946        """
 8947        The `calculation_barcode_family` function calculates barcode values for variants in a VCF file
 8948        and updates the INFO field in the file with the calculated barcode values.
 8949
 8950        :param tag: The `tag` parameter in the `calculation_barcode_family` function is used to specify
 8951        the barcode tag that will be added to the VCF file during the calculation process. If no value
 8952        is provided for the `tag` parameter, the default value used is "BCF", defaults to BCF
 8953        :type tag: str (optional)
 8954        """
 8955
 8956        # if FORMAT and samples
 8957        if (
 8958            "FORMAT" in self.get_header_columns_as_list()
 8959            and self.get_header_sample_list()
 8960        ):
 8961
 8962            # barcode annotation field
 8963            if not tag:
 8964                tag = "BCF"
 8965
 8966            # VCF infos tags
 8967            vcf_infos_tags = {
 8968                tag: "barcode family calculation",
 8969                f"{tag}S": "barcode family samples",
 8970            }
 8971
 8972            # Param
 8973            param = self.get_param()
 8974            log.debug(f"param={param}")
 8975
 8976            # Prefix
 8977            prefix = self.get_explode_infos_prefix()
 8978
 8979            # PED param
 8980            ped = (
 8981                param.get("calculation", {})
 8982                .get("calculations", {})
 8983                .get("BARCODEFAMILY", {})
 8984                .get("family_pedigree", None)
 8985            )
 8986            log.debug(f"ped={ped}")
 8987
 8988            # Load PED
 8989            if ped:
 8990
 8991                # Pedigree is a file
 8992                if isinstance(ped, str) and os.path.exists(full_path(ped)):
 8993                    log.debug("Pedigree is file")
 8994                    with open(full_path(ped)) as ped:
 8995                        ped = json.load(ped)
 8996
 8997                # Pedigree is a string
 8998                elif isinstance(ped, str):
 8999                    log.debug("Pedigree is str")
 9000                    try:
 9001                        ped = json.loads(ped)
 9002                        log.debug("Pedigree is json str")
 9003                    except ValueError as e:
 9004                        ped_samples = ped.split(",")
 9005                        ped = {}
 9006                        for ped_sample in ped_samples:
 9007                            ped[ped_sample] = ped_sample
 9008
 9009                # Pedigree is a dict
 9010                elif isinstance(ped, dict):
 9011                    log.debug("Pedigree is dict")
 9012
 9013                # Pedigree is not well formatted
 9014                else:
 9015                    msg_error = "Pedigree not well formatted"
 9016                    log.error(msg_error)
 9017                    raise ValueError(msg_error)
 9018
 9019                # Construct list
 9020                ped_samples = list(ped.values())
 9021
 9022            else:
 9023                log.debug("Pedigree not defined. Take all samples")
 9024                ped_samples = self.get_header_sample_list()
 9025                ped = {}
 9026                for ped_sample in ped_samples:
 9027                    ped[ped_sample] = ped_sample
 9028
 9029            # Check pedigree
 9030            if not ped or len(ped) == 0:
 9031                msg_error = f"Error in pedigree: samples {ped_samples}"
 9032                log.error(msg_error)
 9033                raise ValueError(msg_error)
 9034
 9035            # Log
 9036            log.info(
 9037                "Calculation 'BARCODEFAMILY' - Samples: "
 9038                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
 9039            )
 9040            log.debug(f"ped_samples={ped_samples}")
 9041
 9042            # Field
 9043            barcode_infos = prefix + tag
 9044
 9045            # Variants table
 9046            table_variants = self.get_table_variants()
 9047
 9048            # Header
 9049            vcf_reader = self.get_header()
 9050
 9051            # Create variant id
 9052            variant_id_column = self.get_variant_id_column()
 9053            added_columns = [variant_id_column]
 9054
 9055            # variant_id, FORMAT and samples
 9056            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9057                ped_samples
 9058            )
 9059
 9060            # Create dataframe
 9061            dataframe_barcode = self.get_query_to_df(
 9062                f""" SELECT {samples_fields} FROM {table_variants} """
 9063            )
 9064
 9065            # Create barcode column
 9066            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
 9067                lambda row: barcode(row, samples=ped_samples), axis=1
 9068            )
 9069
 9070            # Add barcode family to header
 9071            # Add vaf_normalization to header
 9072            vcf_reader.formats[tag] = vcf.parser._Format(
 9073                id=tag,
 9074                num=".",
 9075                type="String",
 9076                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
 9077                type_code=self.code_type_map.get("String"),
 9078            )
 9079            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
 9080                id=f"{tag}S",
 9081                num=".",
 9082                type="String",
 9083                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
 9084                type_code=self.code_type_map.get("String"),
 9085            )
 9086
 9087            # Update
 9088            # for sample in ped_samples:
 9089            sql_update_set = []
 9090            for sample in self.get_header_sample_list() + ["FORMAT"]:
 9091                if sample in ped_samples:
 9092                    value = f'dataframe_barcode."{barcode_infos}"'
 9093                    value_samples = "'" + ",".join(ped_samples) + "'"
 9094                elif sample == "FORMAT":
 9095                    value = f"'{tag}'"
 9096                    value_samples = f"'{tag}S'"
 9097                else:
 9098                    value = "'.'"
 9099                    value_samples = "'.'"
 9100                format_regex = r"[a-zA-Z0-9\s]"
 9101                sql_update_set.append(
 9102                    f"""
 9103                        "{sample}" = 
 9104                        concat(
 9105                            CASE
 9106                                WHEN {table_variants}."{sample}" = './.'
 9107                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
 9108                                ELSE {table_variants}."{sample}"
 9109                            END,
 9110                            ':',
 9111                            {value},
 9112                            ':',
 9113                            {value_samples}
 9114                        )
 9115                    """
 9116                )
 9117
 9118            sql_update_set_join = ", ".join(sql_update_set)
 9119            sql_update = f"""
 9120                UPDATE {table_variants}
 9121                SET {sql_update_set_join}
 9122                FROM dataframe_barcode
 9123                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
 9124            """
 9125            self.conn.execute(sql_update)
 9126
 9127            # Remove added columns
 9128            for added_column in added_columns:
 9129                self.drop_column(column=added_column)
 9130
 9131            # Delete dataframe
 9132            del dataframe_barcode
 9133            gc.collect()
 9134
 9135    def calculation_trio(self) -> None:
 9136        """
 9137        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
 9138        information to the INFO field of each variant.
 9139        """
 9140
 9141        # if FORMAT and samples
 9142        if (
 9143            "FORMAT" in self.get_header_columns_as_list()
 9144            and self.get_header_sample_list()
 9145        ):
 9146
 9147            # trio annotation field
 9148            trio_tag = "trio"
 9149
 9150            # VCF infos tags
 9151            vcf_infos_tags = {
 9152                "trio": "trio calculation",
 9153            }
 9154
 9155            # Param
 9156            param = self.get_param()
 9157
 9158            # Prefix
 9159            prefix = self.get_explode_infos_prefix()
 9160
 9161            # Trio param
 9162            trio_ped = (
 9163                param.get("calculation", {})
 9164                .get("calculations", {})
 9165                .get("TRIO", {})
 9166                .get("trio_pedigree", None)
 9167            )
 9168
 9169            # Load trio
 9170            if trio_ped:
 9171
 9172                # Trio pedigree is a file
 9173                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
 9174                    log.debug("TRIO pedigree is file")
 9175                    with open(full_path(trio_ped)) as trio_ped:
 9176                        trio_ped = json.load(trio_ped)
 9177
 9178                # Trio pedigree is a string
 9179                elif isinstance(trio_ped, str):
 9180                    log.debug("TRIO pedigree is str")
 9181                    try:
 9182                        trio_ped = json.loads(trio_ped)
 9183                        log.debug("TRIO pedigree is json str")
 9184                    except ValueError as e:
 9185                        trio_samples = trio_ped.split(",")
 9186                        if len(trio_samples) == 3:
 9187                            trio_ped = {
 9188                                "father": trio_samples[0],
 9189                                "mother": trio_samples[1],
 9190                                "child": trio_samples[2],
 9191                            }
 9192                            log.debug("TRIO pedigree is list str")
 9193                        else:
 9194                            msg_error = "TRIO pedigree not well formatted"
 9195                            log.error(msg_error)
 9196                            raise ValueError(msg_error)
 9197
 9198                # Trio pedigree is a dict
 9199                elif isinstance(trio_ped, dict):
 9200                    log.debug("TRIO pedigree is dict")
 9201
 9202                # Trio pedigree is not well formatted
 9203                else:
 9204                    msg_error = "TRIO pedigree not well formatted"
 9205                    log.error(msg_error)
 9206                    raise ValueError(msg_error)
 9207
 9208                # Construct trio list
 9209                trio_samples = [
 9210                    trio_ped.get("father", ""),
 9211                    trio_ped.get("mother", ""),
 9212                    trio_ped.get("child", ""),
 9213                ]
 9214
 9215            else:
 9216                log.debug("TRIO pedigree not defined. Take the first 3 samples")
 9217                samples_list = self.get_header_sample_list()
 9218                if len(samples_list) >= 3:
 9219                    trio_samples = self.get_header_sample_list()[0:3]
 9220                    trio_ped = {
 9221                        "father": trio_samples[0],
 9222                        "mother": trio_samples[1],
 9223                        "child": trio_samples[2],
 9224                    }
 9225                else:
 9226                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
 9227                    log.error(msg_error)
 9228                    raise ValueError(msg_error)
 9229
 9230            # Check trio pedigree
 9231            if not trio_ped or len(trio_ped) != 3:
 9232                msg_error = f"Error in TRIO pedigree: {trio_ped}"
 9233                log.error(msg_error)
 9234                raise ValueError(msg_error)
 9235
 9236            # Log
 9237            log.info(
 9238                f"Calculation 'TRIO' - Samples: "
 9239                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
 9240            )
 9241
 9242            # Field
 9243            trio_infos = prefix + trio_tag
 9244
 9245            # Variants table
 9246            table_variants = self.get_table_variants()
 9247
 9248            # Header
 9249            vcf_reader = self.get_header()
 9250
 9251            # Create variant id
 9252            variant_id_column = self.get_variant_id_column()
 9253            added_columns = [variant_id_column]
 9254
 9255            # variant_id, FORMAT and samples
 9256            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9257                self.get_header_sample_list()
 9258            )
 9259
 9260            # Create dataframe
 9261            dataframe_trio = self.get_query_to_df(
 9262                f""" SELECT {samples_fields} FROM {table_variants} """
 9263            )
 9264
 9265            # Create trio column
 9266            dataframe_trio[trio_infos] = dataframe_trio.apply(
 9267                lambda row: trio(row, samples=trio_samples), axis=1
 9268            )
 9269
 9270            # Add trio to header
 9271            vcf_reader.infos[trio_tag] = vcf.parser._Info(
 9272                trio_tag,
 9273                ".",
 9274                "String",
 9275                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
 9276                "howard calculation",
 9277                "0",
 9278                self.code_type_map.get("String"),
 9279            )
 9280
 9281            # Update
 9282            sql_update = f"""
 9283                UPDATE {table_variants}
 9284                SET "INFO" = 
 9285                    concat(
 9286                        CASE
 9287                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9288                            THEN ''
 9289                            ELSE concat("INFO", ';')
 9290                        END,
 9291                        CASE
 9292                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
 9293                             AND dataframe_trio."{trio_infos}" NOT NULL
 9294                            THEN concat(
 9295                                    '{trio_tag}=',
 9296                                    dataframe_trio."{trio_infos}"
 9297                                )
 9298                            ELSE ''
 9299                        END
 9300                    )
 9301                FROM dataframe_trio
 9302                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
 9303            """
 9304            self.conn.execute(sql_update)
 9305
 9306            # Remove added columns
 9307            for added_column in added_columns:
 9308                self.drop_column(column=added_column)
 9309
 9310            # Delete dataframe
 9311            del dataframe_trio
 9312            gc.collect()
 9313
 9314    def calculation_vaf_normalization(self) -> None:
 9315        """
 9316        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
 9317        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
 9318        :return: The function does not return anything.
 9319        """
 9320
 9321        # if FORMAT and samples
 9322        if (
 9323            "FORMAT" in self.get_header_columns_as_list()
 9324            and self.get_header_sample_list()
 9325        ):
 9326
 9327            # vaf_normalization annotation field
 9328            vaf_normalization_tag = "VAF"
 9329
 9330            # VCF infos tags
 9331            vcf_infos_tags = {
 9332                "VAF": "VAF Variant Frequency",
 9333            }
 9334
 9335            # Prefix
 9336            prefix = self.get_explode_infos_prefix()
 9337
 9338            # Variants table
 9339            table_variants = self.get_table_variants()
 9340
 9341            # Header
 9342            vcf_reader = self.get_header()
 9343
 9344            # Do not calculate if VAF already exists
 9345            if "VAF" in vcf_reader.formats:
 9346                log.debug("VAF already on genotypes")
 9347                return
 9348
 9349            # Create variant id
 9350            variant_id_column = self.get_variant_id_column()
 9351            added_columns = [variant_id_column]
 9352
 9353            # variant_id, FORMAT and samples
 9354            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9355                f""" "{sample}" """ for sample in self.get_header_sample_list()
 9356            )
 9357
 9358            # Create dataframe
 9359            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
 9360            log.debug(f"query={query}")
 9361            dataframe_vaf_normalization = self.get_query_to_df(query=query)
 9362
 9363            vaf_normalization_set = []
 9364
 9365            # for each sample vaf_normalization
 9366            for sample in self.get_header_sample_list():
 9367                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
 9368                    lambda row: vaf_normalization(row, sample=sample), axis=1
 9369                )
 9370                vaf_normalization_set.append(
 9371                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
 9372                )
 9373
 9374            # Add VAF to FORMAT
 9375            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
 9376                "FORMAT"
 9377            ].apply(lambda x: str(x) + ":VAF")
 9378            vaf_normalization_set.append(
 9379                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
 9380            )
 9381
 9382            # Add vaf_normalization to header
 9383            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
 9384                id=vaf_normalization_tag,
 9385                num="1",
 9386                type="Float",
 9387                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
 9388                type_code=self.code_type_map.get("Float"),
 9389            )
 9390
 9391            # Create fields to add in INFO
 9392            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
 9393
 9394            # Update
 9395            sql_update = f"""
 9396                UPDATE {table_variants}
 9397                SET {sql_vaf_normalization_set}
 9398                FROM dataframe_vaf_normalization
 9399                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
 9400
 9401            """
 9402            self.conn.execute(sql_update)
 9403
 9404            # Remove added columns
 9405            for added_column in added_columns:
 9406                self.drop_column(column=added_column)
 9407
 9408            # Delete dataframe
 9409            del dataframe_vaf_normalization
 9410            gc.collect()
 9411
 9412    def calculation_genotype_stats(self, info: str = "VAF") -> None:
 9413        """
 9414        The `calculation_genotype_stats` function calculates genotype statistics for a given information
 9415        field in a VCF file and updates the INFO column of the variants table with the calculated
 9416        statistics.
 9417
 9418        :param info: The `info` parameter is a string that represents the type of information for which
 9419        genotype statistics are calculated. It is used to generate various VCF info tags for the
 9420        statistics, such as the number of occurrences, the list of values, the minimum value, the
 9421        maximum value, the mean, the median, defaults to VAF
 9422        :type info: str (optional)
 9423        """
 9424
 9425        # if FORMAT and samples
 9426        if (
 9427            "FORMAT" in self.get_header_columns_as_list()
 9428            and self.get_header_sample_list()
 9429        ):
 9430
 9431            # vaf_stats annotation field
 9432            vaf_stats_tag = info + "_stats"
 9433
 9434            # VCF infos tags
 9435            vcf_infos_tags = {
 9436                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
 9437                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
 9438                info + "_stats_min": f"genotype {info} Statistics - min {info}",
 9439                info + "_stats_max": f"genotype {info} Statistics - max {info}",
 9440                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
 9441                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
 9442                info
 9443                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
 9444            }
 9445
 9446            # Prefix
 9447            prefix = self.get_explode_infos_prefix()
 9448
 9449            # Field
 9450            vaf_stats_infos = prefix + vaf_stats_tag
 9451
 9452            # Variants table
 9453            table_variants = self.get_table_variants()
 9454
 9455            # Header
 9456            vcf_reader = self.get_header()
 9457
 9458            # Create variant id
 9459            variant_id_column = self.get_variant_id_column()
 9460            added_columns = [variant_id_column]
 9461
 9462            # variant_id, FORMAT and samples
 9463            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
 9464                self.get_header_sample_list()
 9465            )
 9466
 9467            # Create dataframe
 9468            dataframe_vaf_stats = self.get_query_to_df(
 9469                f""" SELECT {samples_fields} FROM {table_variants} """
 9470            )
 9471
 9472            # Create vaf_stats column
 9473            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
 9474                lambda row: genotype_stats(
 9475                    row, samples=self.get_header_sample_list(), info=info
 9476                ),
 9477                axis=1,
 9478            )
 9479
 9480            # List of vcf tags
 9481            sql_vaf_stats_fields = []
 9482
 9483            # Check all VAF stats infos
 9484            for stat in vcf_infos_tags:
 9485
 9486                # Extract stats
 9487                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
 9488                    lambda x: dict(x).get(stat, "")
 9489                )
 9490
 9491                # Add snpeff_hgvs to header
 9492                vcf_reader.infos[stat] = vcf.parser._Info(
 9493                    stat,
 9494                    ".",
 9495                    "String",
 9496                    vcf_infos_tags.get(stat, "genotype statistics"),
 9497                    "howard calculation",
 9498                    "0",
 9499                    self.code_type_map.get("String"),
 9500                )
 9501
 9502                if len(sql_vaf_stats_fields):
 9503                    sep = ";"
 9504                else:
 9505                    sep = ""
 9506
 9507                # Create fields to add in INFO
 9508                sql_vaf_stats_fields.append(
 9509                    f"""
 9510                        CASE
 9511                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
 9512                            THEN concat(
 9513                                    '{sep}{stat}=',
 9514                                    dataframe_vaf_stats."{stat}"
 9515                                )
 9516                            ELSE ''
 9517                        END
 9518                    """
 9519                )
 9520
 9521            # SQL set for update
 9522            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)
 9523
 9524            # Update
 9525            sql_update = f"""
 9526                UPDATE {table_variants}
 9527                SET "INFO" = 
 9528                    concat(
 9529                        CASE
 9530                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9531                            THEN ''
 9532                            ELSE concat("INFO", ';')
 9533                        END,
 9534                        {sql_vaf_stats_fields_set}
 9535                    )
 9536                FROM dataframe_vaf_stats
 9537                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"
 9538
 9539            """
 9540            self.conn.execute(sql_update)
 9541
 9542            # Remove added columns
 9543            for added_column in added_columns:
 9544                self.drop_column(column=added_column)
 9545
 9546            # Delete dataframe
 9547            del dataframe_vaf_stats
 9548            gc.collect()
 9549
 9550    def calculation_transcripts_annotation(
 9551        self, info_json: str = None, info_format: str = None
 9552    ) -> None:
 9553        """
 9554        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
 9555        field to it if transcripts are available.
 9556
 9557        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
 9558        is a string parameter that represents the information field to be used in the transcripts JSON.
 9559        It is used to specify the JSON format for the transcripts information. If no value is provided
 9560        when calling the method, it defaults to "
 9561        :type info_json: str
 9562        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
 9563        method is a string parameter that specifies the format of the information field to be used in
 9564        the transcripts JSON. It is used to define the format of the information field
 9565        :type info_format: str
 9566        """
 9567
 9568        # Create transcripts table
 9569        transcripts_table = self.create_transcript_view()
 9570
 9571        # Add info field
 9572        if transcripts_table:
 9573            self.transcript_view_to_variants(
 9574                transcripts_table=transcripts_table,
 9575                transcripts_info_field_json=info_json,
 9576                transcripts_info_field_format=info_format,
 9577            )
 9578        else:
 9579            log.info("No Transcripts to process. Check param.json file configuration")
 9580
 9581    def calculation_transcripts_prioritization(self) -> None:
 9582        """
 9583        The function `calculation_transcripts_prioritization` creates a transcripts table and
 9584        prioritizes transcripts based on certain criteria.
 9585        """
 9586
 9587        # Create transcripts table
 9588        transcripts_table = self.create_transcript_view()
 9589
 9590        # Add info field
 9591        if transcripts_table:
 9592            self.transcripts_prioritization(transcripts_table=transcripts_table)
 9593        else:
 9594            log.info("No Transcripts to process. Check param.json file configuration")
 9595
 9596    ###############
 9597    # Transcripts #
 9598    ###############
 9599
 9600    def transcripts_prioritization(
 9601        self, transcripts_table: str = None, param: dict = {}
 9602    ) -> bool:
 9603        """
 9604        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
 9605        and updates the variants table with the prioritized information.
 9606
 9607        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
 9608        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
 9609        This parameter is used to identify the table where the transcripts data is stored for the
 9610        prioritization process
 9611        :type transcripts_table: str
 9612        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
 9613        that contains various configuration settings for the prioritization process of transcripts. It
 9614        is used to customize the behavior of the prioritization algorithm and includes settings such as
 9615        the prefix for prioritization fields, default profiles, and other
 9616        :type param: dict
 9617        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
 9618        transcripts prioritization process is successfully completed, and `False` if there are any
 9619        issues or if no profile is defined for transcripts prioritization.
 9620        """
 9621
 9622        log.debug("Start transcripts prioritization...")
 9623
 9624        # Param
 9625        if not param:
 9626            param = self.get_param()
 9627
 9628        # Variants table
 9629        table_variants = self.get_table_variants()
 9630        log.debug(f"transcripts_table={transcripts_table}")
 9631        # Transcripts table
 9632        if transcripts_table is None:
 9633            log.debug(f"transcripts_table={transcripts_table}")
 9634            transcripts_table = self.create_transcript_view(
 9635                transcripts_table="transcripts", param=param
 9636            )
 9637            log.debug(f"transcripts_table={transcripts_table}")
 9638        if transcripts_table is None:
 9639            msg_err = "No Transcripts table availalble"
 9640            log.error(msg_err)
 9641            raise ValueError(msg_err)
 9642
 9643        # Get transcripts columns
 9644        columns_as_list_query = f"""
 9645            DESCRIBE {transcripts_table}
 9646        """
 9647        columns_as_list = list(
 9648            self.get_query_to_df(columns_as_list_query)["column_name"]
 9649        )
 9650
 9651        # Create INFO if not exists
 9652        if "INFO" not in columns_as_list:
 9653            query_add_info = f"""
 9654                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
 9655            """
 9656            self.execute_query(query_add_info)
 9657
 9658        # Prioritization param and Force only PZ Score and Flag
 9659        pz_param = param.get("transcripts", {}).get("prioritization", {})
 9660        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
 9661        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
 9662        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
 9663        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
 9664        pz_profile_default = (
 9665            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
 9666        )
 9667
 9668        # Exit if no profile
 9669        if pz_profile_default is None:
 9670            log.warning("No profile defined for transcripts prioritization")
 9671            return False
 9672
 9673        # Prioritization
 9674        prioritization_result = self.prioritization(
 9675            table=transcripts_table,
 9676            pz_param=param.get("transcripts", {}).get("prioritization", {}),
 9677        )
 9678        if not prioritization_result:
 9679            log.warning("Transcripts prioritization not processed")
 9680            return False
 9681
 9682        # Explode PZ fields
 9683        self.explode_infos(
 9684            table=transcripts_table,
 9685            fields=param.get("transcripts", {})
 9686            .get("prioritization", {})
 9687            .get("pzfields", []),
 9688        )
 9689
 9690        # Export Transcripts prioritization infos to variants table
 9691        query_update = f"""
 9692            WITH RankedTranscripts AS (
 9693                SELECT
 9694                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
 9695                    ROW_NUMBER() OVER (
 9696                        PARTITION BY "#CHROM", POS, REF, ALT
 9697                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
 9698                    ) AS rn
 9699                FROM
 9700                    {transcripts_table}
 9701            )
 9702            UPDATE {table_variants}
 9703                SET
 9704                INFO = CONCAT(CASE
 9705                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
 9706                            THEN ''
 9707                            ELSE concat("INFO", ';')
 9708                        END,
 9709                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
 9710                        )
 9711            FROM
 9712                RankedTranscripts
 9713            WHERE
 9714                rn = 1
 9715                AND variants."#CHROM" = RankedTranscripts."#CHROM"
 9716                AND variants."POS" = RankedTranscripts."POS"
 9717                AND variants."REF" = RankedTranscripts."REF"
 9718                AND variants."ALT" = RankedTranscripts."ALT"
 9719                
 9720        """
 9721        self.execute_query(query=query_update)
 9722
 9723        # Add PZ Transcript in header
 9724        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
 9725            pz_fields_transcripts,
 9726            ".",
 9727            "String",
 9728            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
 9729            "unknown",
 9730            "unknown",
 9731            code_type_map["String"],
 9732        )
 9733
 9734        # Return
 9735        return True
 9736
 9737    def create_transcript_view_from_columns_map(
 9738        self,
 9739        transcripts_table: str = "transcripts",
 9740        columns_maps: dict = {},
 9741        added_columns: list = [],
 9742        temporary_tables: list = None,
 9743        annotation_fields: list = None,
 9744    ) -> tuple[list, list, list]:
 9745        """
 9746        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
 9747        specified columns mapping for transcripts data.
 9748
 9749        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9750        the table where the transcripts data is stored or will be stored in the database. This table
 9751        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
 9752        predictions, etc. It defaults to "transcripts, defaults to transcripts
 9753        :type transcripts_table: str (optional)
 9754        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
 9755        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
 9756        represents a mapping configuration for a specific set of columns. It typically includes details such
 9757        as the main transcript column and additional information columns
 9758        :type columns_maps: dict
 9759        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
 9760        function is a list that stores the additional columns that will be added to the view being created
 9761        based on the columns map provided. These columns are generated by exploding the transcript
 9762        information columns along with the main transcript column
 9763        :type added_columns: list
 9764        :param temporary_tables: The `temporary_tables` parameter in the
 9765        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
 9766        tables created during the process of creating a transcript view from a columns map. These temporary
 9767        tables are used to store intermediate results or transformations before the final view is generated
 9768        :type temporary_tables: list
 9769        :param annotation_fields: The `annotation_fields` parameter in the
 9770        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
 9771        for annotation in the query view creation process. These fields are extracted from the
 9772        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
 9773        :type annotation_fields: list
 9774        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
 9775        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
 9776        """
 9777
 9778        log.debug("Start transcrpts view creation from columns map...")
 9779
 9780        # "from_columns_map": [
 9781        #     {
 9782        #         "transcripts_column": "Ensembl_transcriptid",
 9783        #         "transcripts_infos_columns": [
 9784        #             "genename",
 9785        #             "Ensembl_geneid",
 9786        #             "LIST_S2_score",
 9787        #             "LIST_S2_pred",
 9788        #         ],
 9789        #     },
 9790        #     {
 9791        #         "transcripts_column": "Ensembl_transcriptid",
 9792        #         "transcripts_infos_columns": [
 9793        #             "genename",
 9794        #             "VARITY_R_score",
 9795        #             "Aloft_pred",
 9796        #         ],
 9797        #     },
 9798        # ],
 9799
 9800        # Init
 9801        if temporary_tables is None:
 9802            temporary_tables = []
 9803        if annotation_fields is None:
 9804            annotation_fields = []
 9805
 9806        # Variants table
 9807        table_variants = self.get_table_variants()
 9808
 9809        for columns_map in columns_maps:
 9810
 9811            # Transcript column
 9812            transcripts_column = columns_map.get("transcripts_column", None)
 9813
 9814            # Transcripts infos columns
 9815            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
 9816
 9817            if transcripts_column is not None:
 9818
 9819                # Explode
 9820                added_columns += self.explode_infos(
 9821                    fields=[transcripts_column] + transcripts_infos_columns
 9822                )
 9823
 9824                # View clauses
 9825                clause_select = []
 9826                for field in [transcripts_column] + transcripts_infos_columns:
 9827                    clause_select.append(
 9828                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
 9829                    )
 9830                    if field not in [transcripts_column]:
 9831                        annotation_fields.append(field)
 9832
 9833                # Querey View
 9834                query = f""" 
 9835                    SELECT
 9836                        "#CHROM", POS, REF, ALT, INFO,
 9837                        "{transcripts_column}" AS 'transcript',
 9838                        {", ".join(clause_select)}
 9839                    FROM (
 9840                        SELECT 
 9841                            "#CHROM", POS, REF, ALT, INFO,
 9842                            {", ".join(clause_select)}
 9843                        FROM {table_variants}
 9844                        )
 9845                    WHERE "{transcripts_column}" IS NOT NULL
 9846                """
 9847
 9848                # Create temporary table
 9849                temporary_table = transcripts_table + "".join(
 9850                    random.choices(string.ascii_uppercase + string.digits, k=10)
 9851                )
 9852
 9853                # Temporary_tables
 9854                temporary_tables.append(temporary_table)
 9855                query_view = f"""
 9856                    CREATE TEMPORARY TABLE {temporary_table}
 9857                    AS ({query})
 9858                """
 9859                self.execute_query(query=query_view)
 9860
 9861        return added_columns, temporary_tables, annotation_fields
 9862
 9863    def create_transcript_view_from_column_format(
 9864        self,
 9865        transcripts_table: str = "transcripts",
 9866        column_formats: dict = {},
 9867        temporary_tables: list = None,
 9868        annotation_fields: list = None,
 9869    ) -> tuple[list, list, list]:
 9870        """
 9871        The `create_transcript_view_from_column_format` function generates a transcript view based on
 9872        specified column formats, adds additional columns and annotation fields, and returns the list of
 9873        temporary tables and annotation fields.
 9874
 9875        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
 9876        the table containing the transcripts data. This table will be used as the base table for creating
 9877        the transcript view. The default value for this parameter is "transcripts", but you can provide a
 9878        different table name if needed, defaults to transcripts
 9879        :type transcripts_table: str (optional)
 9880        :param column_formats: The `column_formats` parameter is a dictionary that contains information
 9881        about the columns to be used for creating the transcript view. Each entry in the dictionary
 9882        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
 9883        the provided code snippet:
 9884        :type column_formats: dict
 9885        :param temporary_tables: The `temporary_tables` parameter in the
 9886        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
 9887        views created during the process of creating a transcript view from a column format. These temporary
 9888        views are used to manipulate and extract data before generating the final transcript view. It
 9889        :type temporary_tables: list
 9890        :param annotation_fields: The `annotation_fields` parameter in the
 9891        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
 9892        that are extracted from the temporary views created during the process. These annotation fields are
 9893        obtained by querying the temporary views and extracting the column names excluding specific columns
 9894        like `#CH
 9895        :type annotation_fields: list
 9896        :return: The `create_transcript_view_from_column_format` function returns two lists:
 9897        `temporary_tables` and `annotation_fields`.
 9898        """
 9899
 9900        log.debug("Start transcrpts view creation from column format...")
 9901
 9902        #  "from_column_format": [
 9903        #     {
 9904        #         "transcripts_column": "ANN",
 9905        #         "transcripts_infos_column": "Feature_ID",
 9906        #     }
 9907        # ],
 9908
 9909        # Init
 9910        if temporary_tables is None:
 9911            temporary_tables = []
 9912        if annotation_fields is None:
 9913            annotation_fields = []
 9914
 9915        for column_format in column_formats:
 9916
 9917            # annotation field and transcript annotation field
 9918            annotation_field = column_format.get("transcripts_column", "ANN")
 9919            transcript_annotation = column_format.get(
 9920                "transcripts_infos_column", "Feature_ID"
 9921            )
 9922
 9923            # Temporary View name
 9924            temporary_view_name = transcripts_table + "".join(
 9925                random.choices(string.ascii_uppercase + string.digits, k=10)
 9926            )
 9927
 9928            # Create temporary view name
 9929            temporary_view_name = self.annotation_format_to_table(
 9930                uniquify=True,
 9931                annotation_field=annotation_field,
 9932                view_name=temporary_view_name,
 9933                annotation_id=transcript_annotation,
 9934            )
 9935
 9936            # Annotation fields
 9937            if temporary_view_name:
 9938                query_annotation_fields = f"""
 9939                    SELECT *
 9940                    FROM (
 9941                        DESCRIBE SELECT *
 9942                        FROM {temporary_view_name}
 9943                        )
 9944                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
 9945                """
 9946                df_annotation_fields = self.get_query_to_df(
 9947                    query=query_annotation_fields
 9948                )
 9949
 9950                # Add temporary view and annotation fields
 9951                temporary_tables.append(temporary_view_name)
 9952                annotation_fields += list(set(df_annotation_fields["column_name"]))
 9953
 9954        return temporary_tables, annotation_fields
 9955
 9956    def create_transcript_view(
 9957        self,
 9958        transcripts_table: str = None,
 9959        transcripts_table_drop: bool = True,
 9960        param: dict = {},
 9961    ) -> str:
 9962        """
 9963        The `create_transcript_view` function generates a transcript view by processing data from a
 9964        specified table based on provided parameters and structural information.
 9965
 9966        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9967        is used to specify the name of the table that will store the final transcript view data. If a table
 9968        name is not provided, the function will create a new table to store the transcript view data, and by
 9969        default,, defaults to transcripts
 9970        :type transcripts_table: str (optional)
 9971        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9972        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9973        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9974        the function will drop the existing transcripts table if it exists, defaults to True
 9975        :type transcripts_table_drop: bool (optional)
 9976        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9977        contains information needed to create a transcript view. It includes details such as the structure
 9978        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9979        the view. This parameter allows for flexibility and customization
 9980        :type param: dict
 9981        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9982        created or modified during the execution of the function.
 9983        """
 9984
 9985        log.debug("Start transcripts view creation...")
 9986
 9987        # Default
 9988        transcripts_table_default = "transcripts"
 9989
 9990        # Param
 9991        if not param:
 9992            param = self.get_param()
 9993
 9994        # Struct
 9995        struct = param.get("transcripts", {}).get("struct", None)
 9996
 9997        if struct:
 9998
 9999            # Transcripts table
10000            if transcripts_table is None:
10001                transcripts_table = param.get("transcripts", {}).get(
10002                    "table", transcripts_table_default
10003                )
10004
10005            # added_columns
10006            added_columns = []
10007
10008            # Temporary tables
10009            temporary_tables = []
10010
10011            # Annotation fields
10012            annotation_fields = []
10013
10014            # from columns map
10015            columns_maps = struct.get("from_columns_map", [])
10016            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10017                self.create_transcript_view_from_columns_map(
10018                    transcripts_table=transcripts_table,
10019                    columns_maps=columns_maps,
10020                    added_columns=added_columns,
10021                    temporary_tables=temporary_tables,
10022                    annotation_fields=annotation_fields,
10023                )
10024            )
10025            added_columns += added_columns_tmp
10026            temporary_tables += temporary_tables_tmp
10027            annotation_fields += annotation_fields_tmp
10028
10029            # from column format
10030            column_formats = struct.get("from_column_format", [])
10031            temporary_tables_tmp, annotation_fields_tmp = (
10032                self.create_transcript_view_from_column_format(
10033                    transcripts_table=transcripts_table,
10034                    column_formats=column_formats,
10035                    temporary_tables=temporary_tables,
10036                    annotation_fields=annotation_fields,
10037                )
10038            )
10039            temporary_tables += temporary_tables_tmp
10040            annotation_fields += annotation_fields_tmp
10041
10042            # Merge temporary tables query
10043            query_merge = ""
10044            for temporary_table in temporary_tables:
10045
10046                # First temporary table
10047                if not query_merge:
10048                    query_merge = f"""
10049                        SELECT * FROM {temporary_table}
10050                    """
10051                # other temporary table (using UNION)
10052                else:
10053                    query_merge += f"""
10054                        UNION BY NAME SELECT * FROM {temporary_table}
10055                    """
10056
10057            # Merge on transcript
10058            query_merge_on_transcripts_annotation_fields = []
10059            # Aggregate all annotations fields
10060            for annotation_field in set(annotation_fields):
10061                query_merge_on_transcripts_annotation_fields.append(
10062                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
10063                )
10064            # Query for transcripts view
10065            query_merge_on_transcripts = f"""
10066                SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
10067                FROM ({query_merge})
10068                GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
10069            """
10070
10071            # Drop transcript view is necessary
10072            if transcripts_table_drop:
10073                query_drop = f"""
10074                    DROP TABLE IF EXISTS {transcripts_table};
10075                """
10076                self.execute_query(query=query_drop)
10077
10078            # Merge and create transcript view
10079            query_create_view = f"""
10080                CREATE TABLE IF NOT EXISTS {transcripts_table}
10081                AS {query_merge_on_transcripts}
10082            """
10083            self.execute_query(query=query_create_view)
10084
10085            # Remove added columns
10086            for added_column in added_columns:
10087                self.drop_column(column=added_column)
10088
10089        else:
10090
10091            transcripts_table = None
10092
10093        return transcripts_table
10094
10095    def annotation_format_to_table(
10096        self,
10097        uniquify: bool = True,
10098        annotation_field: str = "ANN",
10099        annotation_id: str = "Feature_ID",
10100        view_name: str = "transcripts",
10101    ) -> str:
10102        """
10103        The function `annotation_format_to_table` converts annotation data from a VCF file into a structured
10104        table format.
10105
10106        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique
10107        values in the output or not. If set to `True`, the function will make sure that the output values
10108        are unique, defaults to True
10109        :type uniquify: bool (optional)
10110        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that
10111        contains the annotation information for each variant. This field is used to extract the annotation
10112        details for further processing in the function, defaults to ANN
10113        :type annotation_field: str (optional)
10114        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is
10115        used to specify the identifier for the annotation feature. This identifier will be used as a column
10116        name in the resulting table or view that is created based on the annotation data. It helps in
10117        uniquely identifying each annotation entry in the, defaults to Feature_ID
10118        :type annotation_id: str (optional)
10119        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to
10120        specify the name of the temporary table that will be created to store the transformed annotation
10121        data. This table will hold the extracted information from the annotation field in a structured
10122        format for further processing or analysis, defaults to transcripts
10123        :type view_name: str (optional)
10124        :return: The function `annotation_format_to_table` is returning the name of the view created, which
10125        is stored in the variable `view_name`.
10126        """
10127
10128        # Annotation field
10129        annotation_format = "annotation_explode"
10130
10131        # Transcript annotation
10132        annotation_id = "".join(char for char in annotation_id if char.isalnum())
10133
10134        # Prefix
10135        prefix = self.get_explode_infos_prefix()
10136        if prefix:
10137            prefix = "INFO/"
10138
10139        # Annotation fields
10140        annotation_infos = prefix + annotation_field
10141        annotation_format_infos = prefix + annotation_format
10142
10143        # Variants table
10144        table_variants = self.get_table_variants()
10145
10146        # Header
10147        vcf_reader = self.get_header()
10148
10149        # Add columns
10150        added_columns = []
10151
10152        # Explode HGVS field in column
10153        added_columns += self.explode_infos(fields=[annotation_field])
10154
10155        if annotation_field in vcf_reader.infos:
10156
10157            # Extract ANN header
10158            ann_description = vcf_reader.infos[annotation_field].desc
10159            pattern = r"'(.+?)'"
10160            match = re.search(pattern, ann_description)
10161            if match:
10162                ann_header_match = match.group(1).split(" | ")
10163                ann_header = []
10164                ann_header_desc = {}
10165                for i in range(len(ann_header_match)):
10166                    ann_header_info = "".join(
10167                        char for char in ann_header_match[i] if char.isalnum()
10168                    )
10169                    ann_header.append(ann_header_info)
10170                    ann_header_desc[ann_header_info] = ann_header_match[i]
10171                if not ann_header_desc:
10172                    raise ValueError("Invalid header description format")
10173            else:
10174                raise ValueError("Invalid header description format")
10175
10176            # Create variant id
10177            variant_id_column = self.get_variant_id_column()
10178            added_columns += [variant_id_column]
10179
10180            # Create dataframe
10181            dataframe_annotation_format = self.get_query_to_df(
10182                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
10183            )
10184
10185            # Create annotation columns
10186            dataframe_annotation_format[
10187                annotation_format_infos
10188            ] = dataframe_annotation_format[annotation_infos].apply(
10189                lambda x: explode_annotation_format(
10190                    annotation=str(x),
10191                    uniquify=uniquify,
10192                    output_format="JSON",
10193                    prefix="",
10194                    header=list(ann_header_desc.values()),
10195                )
10196            )
10197
10198            # Find keys
10199            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
10200            df_keys = self.get_query_to_df(query=query_json)
10201
10202            # Check keys
10203            query_json_key = []
10204            for _, row in df_keys.iterrows():
10205
10206                # Key
10207                key = row.iloc[0]
10208
10209                # key_clean
10210                key_clean = "".join(char for char in key if char.isalnum())
10211
10212                # Type
10213                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
10214
10215                # Get DataFrame from query
10216                df_json_type = self.get_query_to_df(query=query_json_type)
10217
10218                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
10219                with pd.option_context("future.no_silent_downcasting", True):
10220                    df_json_type.fillna(value="", inplace=True)
10221                    replace_dict = {None: np.nan, "": np.nan}
10222                    df_json_type.replace(replace_dict, inplace=True)
10223                    df_json_type.dropna(inplace=True)
10224
10225                # Detect column type
10226                column_type = detect_column_type(df_json_type[key_clean])
10227
10228                # Append
10229                query_json_key.append(
10230                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
10231                )
10232
10233            # Create view
10234            query_view = f"""
10235                CREATE TEMPORARY TABLE {view_name}
10236                AS (
10237                    SELECT *, {annotation_id} AS 'transcript'
10238                    FROM (
10239                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
10240                        FROM dataframe_annotation_format
10241                        )
10242                    );
10243            """
10244            self.execute_query(query=query_view)
10245
10246        else:
10247
10248            # Return None
10249            view_name = None
10250
10251        # Remove added columns
10252        for added_column in added_columns:
10253            self.drop_column(column=added_column)
10254
10255        return view_name
10256
10257    def transcript_view_to_variants(
10258        self,
10259        transcripts_table: str = None,
10260        transcripts_column_id: str = None,
10261        transcripts_info_json: str = None,
10262        transcripts_info_field_json: str = None,
10263        transcripts_info_format: str = None,
10264        transcripts_info_field_format: str = None,
10265        param: dict = {},
10266    ) -> bool:
10267        """
10268        The `transcript_view_to_variants` function updates a variants table with information from
10269        transcripts in JSON format.
10270
10271        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10272        table containing the transcripts data. If this parameter is not provided, the function will
10273        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10274        :type transcripts_table: str
10275        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10276        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10277        identifier is used to match transcripts with variants in the database
10278        :type transcripts_column_id: str
10279        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10280        of the column in the variants table where the transcripts information will be stored in JSON
10281        format. This parameter allows you to define the column in the variants table that will hold the
10282        JSON-formatted information about transcripts
10283        :type transcripts_info_json: str
10284        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10285        specify the field in the VCF header that will contain information about transcripts in JSON
10286        format. This field will be added to the VCF header as an INFO field with the specified name
10287        :type transcripts_info_field_json: str
10288        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10289        format of the information about transcripts that will be stored in the variants table. This
10290        format can be used to define how the transcript information will be structured or displayed
10291        within the variants table
10292        :type transcripts_info_format: str
10293        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10294        specify the field in the VCF header that will contain information about transcripts in a
10295        specific format. This field will be added to the VCF header as an INFO field with the specified
10296        name
10297        :type transcripts_info_field_format: str
10298        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10299        that contains various configuration settings related to transcripts. It is used to provide
10300        default values for certain parameters if they are not explicitly provided when calling the
10301        method. The `param` dictionary can be passed as an argument
10302        :type param: dict
10303        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10304        if the operation is successful and `False` if certain conditions are not met.
10305        """
10306
10307        msg_info_prefix = "Start transcripts view to variants annotations"
10308
10309        log.debug(f"{msg_info_prefix}...")
10310
10311        # Default
10312        transcripts_table_default = "transcripts"
10313        transcripts_column_id_default = "transcript"
10314        transcripts_info_json_default = None
10315        transcripts_info_format_default = None
10316        transcripts_info_field_json_default = None
10317        transcripts_info_field_format_default = None
10318
10319        # Param
10320        if not param:
10321            param = self.get_param()
10322
10323        # Transcripts table
10324        if transcripts_table is None:
10325            transcripts_table = param.get("transcripts", {}).get(
10326                "table", transcripts_table_default
10327            )
10328
10329        # Transcripts column ID
10330        if transcripts_column_id is None:
10331            transcripts_column_id = param.get("transcripts", {}).get(
10332                "column_id", transcripts_column_id_default
10333            )
10334
10335        # Transcripts info json
10336        if transcripts_info_json is None:
10337            transcripts_info_json = param.get("transcripts", {}).get(
10338                "transcripts_info_json", transcripts_info_json_default
10339            )
10340
10341        # Transcripts info field JSON
10342        if transcripts_info_field_json is None:
10343            transcripts_info_field_json = param.get("transcripts", {}).get(
10344                "transcripts_info_field_json", transcripts_info_field_json_default
10345            )
10346        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10347        #     transcripts_info_json = transcripts_info_field_json
10348
10349        # Transcripts info format
10350        if transcripts_info_format is None:
10351            transcripts_info_format = param.get("transcripts", {}).get(
10352                "transcripts_info_format", transcripts_info_format_default
10353            )
10354
10355        # Transcripts info field FORMAT
10356        if transcripts_info_field_format is None:
10357            transcripts_info_field_format = param.get("transcripts", {}).get(
10358                "transcripts_info_field_format", transcripts_info_field_format_default
10359            )
10360        # if (
10361        #     transcripts_info_field_format is not None
10362        #     and transcripts_info_format is None
10363        # ):
10364        #     transcripts_info_format = transcripts_info_field_format
10365
10366        # Variants table
10367        table_variants = self.get_table_variants()
10368
10369        # Check info columns param
10370        if (
10371            transcripts_info_json is None
10372            and transcripts_info_field_json is None
10373            and transcripts_info_format is None
10374            and transcripts_info_field_format is None
10375        ):
10376            return False
10377
10378        # Transcripts infos columns
10379        query_transcripts_infos_columns = f"""
10380            SELECT *
10381            FROM (
10382                DESCRIBE SELECT * FROM {transcripts_table}
10383                )
10384            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10385        """
10386        transcripts_infos_columns = list(
10387            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10388        )
10389
10390        # View results
10391        clause_select = []
10392        clause_to_json = []
10393        clause_to_format = []
10394        for field in transcripts_infos_columns:
10395            clause_select.append(
10396                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10397            )
10398            clause_to_json.append(f""" '{field}': "{field}" """)
10399            clause_to_format.append(f""" "{field}" """)
10400
10401        # Update
10402        update_set_json = []
10403        update_set_format = []
10404
10405        # VCF header
10406        vcf_reader = self.get_header()
10407
10408        # Transcripts to info column in JSON
10409        if transcripts_info_json is not None:
10410
10411            # Create column on variants table
10412            self.add_column(
10413                table_name=table_variants,
10414                column_name=transcripts_info_json,
10415                column_type="JSON",
10416                default_value=None,
10417                drop=False,
10418            )
10419
10420            # Add header
10421            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10422                transcripts_info_json,
10423                ".",
10424                "String",
10425                "Transcripts in JSON format",
10426                "unknwon",
10427                "unknwon",
10428                self.code_type_map["String"],
10429            )
10430
10431            # Add to update
10432            update_set_json.append(
10433                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10434            )
10435
10436        # Transcripts to info field in JSON
10437        if transcripts_info_field_json is not None:
10438
10439            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10440
10441            # Add to update
10442            update_set_json.append(
10443                f""" 
10444                    INFO = concat(
10445                            CASE
10446                                WHEN INFO NOT IN ('', '.')
10447                                THEN INFO
10448                                ELSE ''
10449                            END,
10450                            CASE
10451                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10452                                THEN concat(
10453                                    ';{transcripts_info_field_json}=',
10454                                    t.{transcripts_info_json}
10455                                )
10456                                ELSE ''
10457                            END
10458                            )
10459                """
10460            )
10461
10462            # Add header
10463            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10464                transcripts_info_field_json,
10465                ".",
10466                "String",
10467                "Transcripts in JSON format",
10468                "unknwon",
10469                "unknwon",
10470                self.code_type_map["String"],
10471            )
10472
10473        if update_set_json:
10474
10475            # Update query
10476            query_update = f"""
10477                UPDATE {table_variants}
10478                    SET {", ".join(update_set_json)}
10479                FROM
10480                (
10481                    SELECT
10482                        "#CHROM", POS, REF, ALT,
10483                            concat(
10484                            '{{',
10485                            string_agg(
10486                                '"' || "{transcripts_column_id}" || '":' ||
10487                                to_json(json_output)
10488                            ),
10489                            '}}'
10490                            )::JSON AS {transcripts_info_json}
10491                    FROM
10492                        (
10493                        SELECT
10494                            "#CHROM", POS, REF, ALT,
10495                            "{transcripts_column_id}",
10496                            to_json(
10497                                {{{",".join(clause_to_json)}}}
10498                            )::JSON AS json_output
10499                        FROM
10500                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10501                        WHERE "{transcripts_column_id}" IS NOT NULL
10502                        )
10503                    GROUP BY "#CHROM", POS, REF, ALT
10504                ) AS t
10505                WHERE {table_variants}."#CHROM" = t."#CHROM"
10506                    AND {table_variants}."POS" = t."POS"
10507                    AND {table_variants}."REF" = t."REF"
10508                    AND {table_variants}."ALT" = t."ALT"
10509            """
10510
10511            self.execute_query(query=query_update)
10512
10513        # Transcripts to info column in FORMAT
10514        if transcripts_info_format is not None:
10515
10516            # Create column on variants table
10517            self.add_column(
10518                table_name=table_variants,
10519                column_name=transcripts_info_format,
10520                column_type="VARCHAR",
10521                default_value=None,
10522                drop=False,
10523            )
10524
10525            # Add header
10526            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10527                transcripts_info_format,
10528                ".",
10529                "String",
10530                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10531                "unknwon",
10532                "unknwon",
10533                self.code_type_map["String"],
10534            )
10535
10536            # Add to update
10537            update_set_format.append(
10538                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10539            )
10540
10541        # Transcripts to info field in JSON
10542        if transcripts_info_field_format is not None:
10543
10544            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10545
10546            # Add to update
10547            update_set_format.append(
10548                f""" 
10549                    INFO = concat(
10550                            CASE
10551                                WHEN INFO NOT IN ('', '.')
10552                                THEN INFO
10553                                ELSE ''
10554                            END,
10555                            CASE
10556                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10557                                THEN concat(
10558                                    ';{transcripts_info_field_format}=',
10559                                    t.{transcripts_info_format}
10560                                )
10561                                ELSE ''
10562                            END
10563                            )
10564                """
10565            )
10566
10567            # Add header
10568            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10569                transcripts_info_field_format,
10570                ".",
10571                "String",
10572                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10573                "unknwon",
10574                "unknwon",
10575                self.code_type_map["String"],
10576            )
10577
10578        if update_set_format:
10579
10580            # Update query
10581            query_update = f"""
10582                UPDATE {table_variants}
10583                    SET {", ".join(update_set_format)}
10584                FROM
10585                (
10586                    SELECT
10587                        "#CHROM", POS, REF, ALT,
10588                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10589                    FROM 
10590                        (
10591                        SELECT
10592                            "#CHROM", POS, REF, ALT,
10593                            "{transcripts_column_id}",
10594                            concat(
10595                                "{transcripts_column_id}",
10596                                '|',
10597                                {", '|', ".join(clause_to_format)}
10598                            ) AS {transcripts_info_format}
10599                        FROM
10600                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10601                        )
10602                    GROUP BY "#CHROM", POS, REF, ALT
10603                ) AS t
10604                WHERE {table_variants}."#CHROM" = t."#CHROM"
10605                    AND {table_variants}."POS" = t."POS"
10606                    AND {table_variants}."REF" = t."REF"
10607                    AND {table_variants}."ALT" = t."ALT"
10608            """
10609
10610            self.execute_query(query=query_update)
10611
10612        return True
Variants( conn=None, input: str = None, output: str = None, config: dict = {}, param: dict = {}, load: bool = False)
36    def __init__(
37        self,
38        conn=None,
39        input: str = None,
40        output: str = None,
41        config: dict = {},
42        param: dict = {},
43        load: bool = False,
44    ) -> None:
45        """
46        The function `__init__` initializes the variables, sets the input, output, config, param, connexion and
47        header
48
49        :param conn: the connection to the database
50        :param input: the input file
51        :param output: the output file
52        :param config: a dictionary containing the configuration of the model
53        :param param: a dictionary containing the parameters of the model
54        """
55
56        # Init variables
57        self.init_variables()
58
59        # Input
60        self.set_input(input)
61
62        # Config
63        self.set_config(config)
64
65        # Param
66        self.set_param(param)
67
68        # Output
69        self.set_output(output)
70
71        # connexion
72        self.set_connexion(conn)
73
74        # Header
75        self.set_header()
76
77        # Samples
78        self.set_samples()
79
80        # Load data
81        if load:
82            self.load_data()

The function __init__ initializes the variables, sets the input, output, config, param, connection and header

Parameters
  • conn: the connection to the database
  • input: the input file
  • output: the output file
  • config: a dictionary containing the configuration of the model
  • param: a dictionary containing the parameters of the model
def set_samples(self, samples: list = None) -> list:
 84    def set_samples(self, samples: list = None) -> list:
 85        """
 86        The function `set_samples` sets the samples attribute of an object to a provided list or
 87        retrieves it from a parameter dictionary.
 88
 89        :param samples: The `set_samples` method is a method of a class that takes a list of samples as
 90        input and sets the `samples` attribute of the class to the provided list. If no samples are
 91        provided, it tries to get the samples from the class's parameters using the `get_param` method
 92        :type samples: list
 93        :return: The `samples` list is being returned.
 94        """
 95
 96        if not samples:
 97            samples = self.get_param().get("samples", {}).get("list", None)
 98
 99        self.samples = samples
100
101        return samples

The function set_samples sets the samples attribute of an object to a provided list or retrieves it from a parameter dictionary.

Parameters
  • samples: The set_samples method is a method of a class that takes a list of samples as input and sets the samples attribute of the class to the provided list. If no samples are provided, it tries to get the samples from the class's parameters using the get_param method
Returns

The samples list is being returned.

def get_samples(self) -> list:
103    def get_samples(self) -> list:
104        """
105        This function returns a list of samples.
106        :return: The `get_samples` method is returning the `samples` attribute of the object.
107        """
108
109        return self.samples

This function returns a list of samples.

Returns

The get_samples method is returning the samples attribute of the object.

def get_samples_check(self) -> bool:
111    def get_samples_check(self) -> bool:
112        """
113        This function returns the value of the "check" key within the "samples" dictionary retrieved
114        from the parameters.
115        :return: The method `get_samples_check` is returning the value of the key "check" inside the
116        "samples" dictionary, which is nested inside the dictionary returned by the `get_param()`
117        method. If the key "check" is not found, it will return `True`.
118        """
119
120        return self.get_param().get("samples", {}).get("check", True)

This function returns the value of the "check" key within the "samples" dictionary retrieved from the parameters.

Returns

The method get_samples_check is returning the value of the key "check" inside the "samples" dictionary, which is nested inside the dictionary returned by the get_param() method. If the key "check" is not found, it will return True by default.

def set_input(self, input: str = None) -> None:
122    def set_input(self, input: str = None) -> None:
123        """
124        The function `set_input` takes a file name as input, extracts the name and extension, and sets
125        attributes in the class accordingly.
126
127        :param input: The `set_input` method in the provided code snippet is used to set attributes
128        related to the input file. Here's a breakdown of the parameters and their usage in the method:
129        :type input: str
130        """
131
132        if input and not isinstance(input, str):
133            try:
134                self.input = input.name
135            except:
136                log.error(f"Input file '{input} in bad format")
137                raise ValueError(f"Input file '{input} in bad format")
138        else:
139            self.input = input
140
141        # Input format
142        if input:
143            input_name, input_extension = os.path.splitext(self.input)
144            self.input_name = input_name
145            self.input_extension = input_extension
146            self.input_format = self.input_extension.replace(".", "")

The function set_input takes a file name as input, extracts the name and extension, and sets attributes in the class accordingly.

Parameters
  • input: the input file, given either as a path string or as a file-like object exposing a name attribute; it is used to derive and set the input name, extension, and format attributes on the class.
def set_config(self, config: dict) -> None:
148    def set_config(self, config: dict) -> None:
149        """
150        The set_config function takes a config object and assigns it as the configuration object for the
151        class.
152
153        :param config: The `config` parameter in the `set_config` function is a dictionary object that
154        contains configuration settings for the class. When you call the `set_config` function with a
155        dictionary object as the argument, it will set that dictionary as the configuration object for
156        the class
157        :type config: dict
158        """
159
160        self.config = config

The set_config function takes a config object and assigns it as the configuration object for the class.

Parameters
  • config: The config parameter in the set_config function is a dictionary object that contains configuration settings for the class. When you call the set_config function with a dictionary object as the argument, it will set that dictionary as the configuration object for the class
def set_param(self, param: dict) -> None:
162    def set_param(self, param: dict) -> None:
163        """
164        This function sets a parameter object for the class based on the input dictionary.
165
166        :param param: The `set_param` method you provided takes a dictionary object as input and sets it
167        as the `param` attribute of the class instance
168        :type param: dict
169        """
170
171        self.param = param

This function sets a parameter object for the class based on the input dictionary.

Parameters
  • param: The set_param method you provided takes a dictionary object as input and sets it as the param attribute of the class instance
def init_variables(self) -> None:
173    def init_variables(self) -> None:
174        """
175        This function initializes the variables that will be used in the rest of the class
176        """
177
178        self.prefix = "howard"
179        self.table_variants = "variants"
180        self.dataframe = None
181
182        self.comparison_map = {
183            "gt": ">",
184            "gte": ">=",
185            "lt": "<",
186            "lte": "<=",
187            "equals": "=",
188            "contains": "SIMILAR TO",
189        }
190
191        self.code_type_map = {"Integer": 0, "String": 1, "Float": 2, "Flag": 3}
192
193        self.code_type_map_to_sql = {
194            "Integer": "INTEGER",
195            "String": "VARCHAR",
196            "Float": "FLOAT",
197            "Flag": "VARCHAR",
198        }
199
200        self.index_additionnal_fields = []

This function initializes the variables that will be used in the rest of the class

def get_indexing(self) -> bool:
202    def get_indexing(self) -> bool:
203        """
204        It returns the value of the key "indexing" in the dictionary. If the key is not present, it
205        returns False.
206        :return: The value of the indexing parameter.
207        """
208
209        return self.get_param().get("indexing", False)

It returns the value of the key "indexing" in the dictionary. If the key is not present, it returns False.

Returns

The value of the indexing parameter.

def get_connexion_config(self) -> dict:
211    def get_connexion_config(self) -> dict:
212        """
213        The function `get_connexion_config` returns a dictionary containing the configuration for a
214        connection, including the number of threads and memory limit.
215        :return: a dictionary containing the configuration for the Connexion library.
216        """
217
218        # config
219        config = self.get_config()
220
221        # Connexion config
222        connexion_config = {}
223        threads = self.get_threads()
224
225        # Threads
226        if threads:
227            connexion_config["threads"] = threads
228
229        # Memory
230        # if config.get("memory", None):
231        #     connexion_config["memory_limit"] = config.get("memory")
232        if self.get_memory():
233            connexion_config["memory_limit"] = self.get_memory()
234
235        # Temporary directory
236        if config.get("tmp", None):
237            connexion_config["temp_directory"] = config.get("tmp")
238
239        # Access
240        if config.get("access", None):
241            access = config.get("access")
242            if access in ["RO"]:
243                access = "READ_ONLY"
244            elif access in ["RW"]:
245                access = "READ_WRITE"
246            connexion_db = self.get_connexion_db()
247            if connexion_db in ":memory:":
248                access = "READ_WRITE"
249            connexion_config["access_mode"] = access
250
251        return connexion_config

The function get_connexion_config returns a dictionary containing the configuration for a connection, including the number of threads and memory limit.

Returns

a dictionary containing the configuration for the Connexion library.

def get_duckdb_settings(self) -> dict:
253    def get_duckdb_settings(self) -> dict:
254        """
255        The function `get_duckdb_settings` retrieves DuckDB settings from a configuration file or a
256        string.
257        :return: The function `get_duckdb_settings` returns a dictionary object `duckdb_settings_dict`.
258        """
259
260        # config
261        config = self.get_config()
262
263        # duckdb settings
264        duckdb_settings_dict = {}
265        if config.get("duckdb_settings", None):
266            duckdb_settings = config.get("duckdb_settings")
267            duckdb_settings = full_path(duckdb_settings)
268            # duckdb setting is a file
269            if os.path.exists(duckdb_settings):
270                with open(duckdb_settings) as json_file:
271                    duckdb_settings_dict = yaml.safe_load(json_file)
272            # duckdb settings is a string
273            else:
274                duckdb_settings_dict = json.loads(duckdb_settings)
275
276        return duckdb_settings_dict

The function get_duckdb_settings retrieves DuckDB settings from a configuration file or a string.

Returns

The function get_duckdb_settings returns a dictionary object duckdb_settings_dict.

def set_connexion_db(self) -> str:
278    def set_connexion_db(self) -> str:
279        """
280        The function `set_connexion_db` returns the appropriate database connection string based on the
281        input format and connection type.
282        :return: the value of the variable `connexion_db`.
283        """
284
285        # Default connexion db
286        default_connexion_db = ":memory:"
287
288        # Find connexion db
289        if self.get_input_format() in ["db", "duckdb"]:
290            connexion_db = self.get_input()
291        elif self.get_connexion_type() in ["memory", default_connexion_db, None]:
292            connexion_db = default_connexion_db
293        elif self.get_connexion_type() in ["tmpfile"]:
294            tmp_name = tempfile.mkdtemp(
295                prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".db"
296            )
297            connexion_db = f"{tmp_name}/tmp.db"
298        elif self.get_connexion_type() != "":
299            connexion_db = self.get_connexion_type()
300        else:
301            connexion_db = default_connexion_db
302
303        # Set connexion db
304        self.connexion_db = connexion_db
305
306        return connexion_db

The function set_connexion_db returns the appropriate database connection string based on the input format and connection type.

Returns

the value of the variable connexion_db.

def set_connexion(self, conn) -> None:
308    def set_connexion(self, conn) -> None:
309        """
310        The function `set_connexion` creates a connection to a database, with options for different
311        database formats and settings.
312
313        :param conn: The `conn` parameter in the `set_connexion` method is the connection to the
314        database. If a connection is not provided, a new connection to an in-memory database is created.
315        The method then proceeds to set up the connection based on the specified format (e.g., duckdb or
316        sqlite
317        """
318
319        # Connexion db
320        connexion_db = self.set_connexion_db()
321
322        # Connexion config
323        connexion_config = self.get_connexion_config()
324
325        # Connexion format
326        connexion_format = self.get_config().get("connexion_format", "duckdb")
327        # Set connexion format
328        self.connexion_format = connexion_format
329
330        # Connexion
331        if not conn:
332            if connexion_format in ["duckdb"]:
333                conn = duckdb.connect(connexion_db, config=connexion_config)
334                # duckDB settings
335                duckdb_settings = self.get_duckdb_settings()
336                if duckdb_settings:
337                    for setting in duckdb_settings:
338                        setting_value = duckdb_settings.get(setting)
339                        if isinstance(setting_value, str):
340                            setting_value = f"'{setting_value}'"
341                        conn.execute(f"PRAGMA {setting}={setting_value};")
342            elif connexion_format in ["sqlite"]:
343                conn = sqlite3.connect(connexion_db)
344
345        # Set connexion
346        self.conn = conn
347
348        # Log
349        log.debug(f"connexion_format: {connexion_format}")
350        log.debug(f"connexion_db: {connexion_db}")
351        log.debug(f"connexion config: {connexion_config}")
352        log.debug(f"connexion duckdb settings: {self.get_duckdb_settings()}")

The function set_connexion creates a connection to a database, with options for different database formats and settings.

Parameters
  • conn: The conn parameter in the set_connexion method is the connection to the database. If a connection is not provided, a new connection to an in-memory database is created. The method then proceeds to set up the connection based on the specified format (e.g., duckdb or sqlite).
def set_output(self, output: str = None) -> None:
354    def set_output(self, output: str = None) -> None:
355        """
356        The `set_output` function in Python sets the output file based on the input or a specified key
357        in the config file, extracting the output name, extension, and format.
358
359        :param output: The `output` parameter in the `set_output` method is used to specify the name of
360        the output file. If the config file has an 'output' key, the method sets the output to the value
361        of that key. If no output is provided, it sets the output to `None`
362        :type output: str
363        """
364
365        if output and not isinstance(output, str):
366            self.output = output.name
367        else:
368            self.output = output
369
370        # Output format
371        if self.output:
372            output_name, output_extension = os.path.splitext(self.output)
373            self.output_name = output_name
374            self.output_extension = output_extension
375            self.output_format = self.output_extension.replace(".", "")
376        else:
377            self.output_name = None
378            self.output_extension = None
379            self.output_format = None

The set_output function in Python sets the output file based on the input or a specified key in the config file, extracting the output name, extension, and format.

Parameters
  • output: The output parameter in the set_output method is used to specify the name of the output file. If the config file has an 'output' key, the method sets the output to the value of that key. If no output is provided, it sets the output to None
def set_header(self) -> None:
381    def set_header(self) -> None:
382        """
383        It reads the header of a VCF file and stores it as a list of strings and as a VCF object
384        """
385
386        input_file = self.get_input()
387        default_header_list = [
388            "##fileformat=VCFv4.2",
389            "#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO",
390        ]
391
392        # Full path
393        input_file = full_path(input_file)
394
395        if input_file:
396
397            input_format = self.get_input_format()
398            input_compressed = self.get_input_compressed()
399            config = self.get_config()
400            header_list = default_header_list
401            if input_format in [
402                "vcf",
403                "hdr",
404                "tsv",
405                "csv",
406                "psv",
407                "parquet",
408                "db",
409                "duckdb",
410            ]:
411                # header provided in param
412                if config.get("header_file", None):
413                    with open(config.get("header_file"), "rt") as f:
414                        header_list = self.read_vcf_header(f)
415                # within a vcf file format (header within input file itself)
416                elif input_format in ["vcf", "hdr"] and not os.path.isdir(input_file):
417                    # within a compressed vcf file format (.vcf.gz)
418                    if input_compressed:
419                        with bgzf.open(input_file, "rt") as f:
420                            header_list = self.read_vcf_header(f)
421                    # within an uncompressed vcf file format (.vcf)
422                    else:
423                        with open(input_file, "rt") as f:
424                            header_list = self.read_vcf_header(f)
425                # header provided in default external file .hdr
426                elif os.path.exists((input_file + ".hdr")):
427                    with open(input_file + ".hdr", "rt") as f:
428                        header_list = self.read_vcf_header(f)
429                else:
430                    try:  # Try to get header info fields and file columns
431
432                        with tempfile.TemporaryDirectory() as tmpdir:
433
434                            # Create database
435                            db_for_header = Database(database=input_file)
436
437                            # Get header columns for infos fields
438                            db_header_from_columns = (
439                                db_for_header.get_header_from_columns()
440                            )
441
442                            # Get real columns in the file
443                            db_header_columns = db_for_header.get_columns()
444
445                            # Write header file
446                            header_file_tmp = os.path.join(tmpdir, "header")
447                            f = open(header_file_tmp, "w")
448                            vcf.Writer(f, db_header_from_columns)
449                            f.close()
450
451                            # Replace #CHROM line with real columns
452                            header_list = db_for_header.read_header_file(
453                                header_file=header_file_tmp
454                            )
455                            header_list[-1] = "\t".join(db_header_columns)
456
457                    except:
458
459                        log.warning(
460                            f"No header for file {input_file}. Set as default VCF header"
461                        )
462                        header_list = default_header_list
463
464            else:  # try for unknown format ?
465
466                log.error(f"Input file format '{input_format}' not available")
467                raise ValueError(f"Input file format '{input_format}' not available")
468
469            if not header_list:
470                header_list = default_header_list
471
472            # header as list
473            self.header_list = header_list
474
475            # header as VCF object
476            self.header_vcf = vcf.Reader(io.StringIO("\n".join(header_list)))
477
478        else:
479
480            self.header_list = None
481            self.header_vcf = None

It reads the header of a VCF file and stores it as a list of strings and as a VCF object

def get_query_to_df(self, query: str = '', limit: int = None) -> pandas.core.frame.DataFrame:
483    def get_query_to_df(self, query: str = "", limit: int = None) -> pd.DataFrame:
484        """
485        The `get_query_to_df` function takes a query as a string and returns the result as a pandas
486        DataFrame based on the connection format.
487
488        :param query: The `query` parameter in the `get_query_to_df` function is a string that
489        represents the SQL query you want to execute. This query will be used to fetch data from a
490        database and convert it into a pandas DataFrame
491        :type query: str
492        :param limit: The `limit` parameter in the `get_query_to_df` function is used to specify the
493        maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the
494        function will only fetch up to that number of rows from the database query result. If no limit
495        is specified,
496        :type limit: int
497        :return: A pandas DataFrame is being returned by the `get_query_to_df` function.
498        """
499
500        # Connexion format
501        connexion_format = self.get_connexion_format()
502
503        # Limit in query
504        if limit:
505            pd.set_option("display.max_rows", limit)
506            if connexion_format in ["duckdb"]:
507                df = (
508                    self.conn.execute(query)
509                    .fetch_record_batch(limit)
510                    .read_next_batch()
511                    .to_pandas()
512                )
513            elif connexion_format in ["sqlite"]:
514                df = next(pd.read_sql_query(query, self.conn, chunksize=limit))
515
516        # Full query
517        else:
518            if connexion_format in ["duckdb"]:
519                df = self.conn.execute(query).df()
520            elif connexion_format in ["sqlite"]:
521                df = pd.read_sql_query(query, self.conn)
522
523        return df

The get_query_to_df function takes a query as a string and returns the result as a pandas DataFrame based on the connection format.

Parameters
  • query: The query parameter in the get_query_to_df function is a string that represents the SQL query you want to execute. This query will be used to fetch data from a database and convert it into a pandas DataFrame
  • limit: The limit parameter in the get_query_to_df function is used to specify the maximum number of rows to be returned in the resulting dataframe. If a limit is provided, the function will only fetch up to that number of rows from the database query result. If no limit is specified, the entire query result is returned.
Returns

A pandas DataFrame is being returned by the get_query_to_df function.

def get_overview(self) -> None:
525    def get_overview(self) -> None:
526        """
527        The function prints the input, output, config, and dataframe of the current object
528        """
529        table_variants_from = self.get_table_variants(clause="from")
530        sql_columns = self.get_header_columns_as_sql()
531        sql_query_export = f"SELECT {sql_columns} FROM {table_variants_from}"
532        df = self.get_query_to_df(sql_query_export)
533        log.info(
534            "Input:  "
535            + str(self.get_input())
536            + " ["
537            + str(str(self.get_input_format()))
538            + "]"
539        )
540        log.info(
541            "Output: "
542            + str(self.get_output())
543            + " ["
544            + str(str(self.get_output_format()))
545            + "]"
546        )
547        log.info("Config: ")
548        for d in str(json.dumps(self.get_config(), indent=4, sort_keys=True)).split(
549            "\n"
550        ):
551            log.info("\t" + str(d))
552        log.info("Param: ")
553        for d in str(json.dumps(self.get_param(), indent=4, sort_keys=True)).split(
554            "\n"
555        ):
556            log.info("\t" + str(d))
557        log.info("Sample list: " + str(self.get_header_sample_list()))
558        log.info("Dataframe: ")
559        for d in str(df).split("\n"):
560            log.info("\t" + str(d))
561
562        # garbage collector
563        del df
564        gc.collect()
565
566        return None

The function prints the input, output, config, and dataframe of the current object

def get_stats(self) -> dict:
568    def get_stats(self) -> dict:
569        """
570        The `get_stats` function calculates and returns various statistics of the current object,
571        including information about the input file, variants, samples, header fields, quality, and
572        SNVs/InDels.
573        :return: a dictionary containing various statistics of the current object. The dictionary has
574        the following structure:
575        """
576
577        # Log
578        log.info(f"Stats Calculation...")
579
580        # table variants
581        table_variants_from = self.get_table_variants()
582
583        # stats dict
584        stats = {"Infos": {}}
585
586        ### File
587        input_file = self.get_input()
588        stats["Infos"]["Input file"] = input_file
589
590        # Header
591        header_infos = self.get_header().infos
592        header_formats = self.get_header().formats
593        header_infos_list = list(header_infos)
594        header_formats_list = list(header_formats)
595
596        ### Variants
597
598        stats["Variants"] = {}
599
600        # Variants by chr
601        sql_query_nb_variant_by_chrom = f'SELECT "#CHROM" as CHROM, count(*) as count FROM {table_variants_from} GROUP BY "#CHROM"'
602        df_nb_of_variants_by_chrom = self.get_query_to_df(sql_query_nb_variant_by_chrom)
603        nb_of_variants_by_chrom = df_nb_of_variants_by_chrom.sort_values(
604            by=["CHROM"], kind="quicksort"
605        )
606
607        # Total number of variants
608        nb_of_variants = nb_of_variants_by_chrom["count"].sum()
609
610        # Calculate percentage
611        nb_of_variants_by_chrom["percent"] = nb_of_variants_by_chrom["count"].apply(
612            lambda x: (x / nb_of_variants)
613        )
614
615        stats["Variants"]["Number of variants by chromosome"] = (
616            nb_of_variants_by_chrom.to_dict(orient="index")
617        )
618
619        stats["Infos"]["Number of variants"] = int(nb_of_variants)
620
621        ### Samples
622
623        # Init
624        samples = {}
625        nb_of_samples = 0
626
627        # Check Samples
628        if "GT" in header_formats_list and "FORMAT" in self.get_header_columns():
629            log.debug(f"Check samples...")
630            for sample in self.get_header_sample_list():
631                sql_query_samples = f"""
632                    SELECT  '{sample}' as sample,
633                            REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1) as genotype,
634                            count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1)) as count,
635                            concat((count(REGEXP_EXTRACT("{sample}", '^([0-9/|.]*)[:]*',1))/{nb_of_variants})) as percentage
636                    FROM {table_variants_from}
637                    WHERE (
638                        regexp_matches("{sample}", '^[0-9]([/|][0-9])+')
639                        AND
640                        len(string_split(CAST("FORMAT" AS VARCHAR), ':')) = len(string_split(CAST("{sample}" AS VARCHAR), ':'))
641                      )
642                    GROUP BY genotype
643                    """
644                sql_query_genotype_df = self.conn.execute(sql_query_samples).df()
645                sample_genotype_count = sql_query_genotype_df["count"].sum()
646                if len(sql_query_genotype_df):
647                    nb_of_samples += 1
648                    samples[f"{sample} - {sample_genotype_count} variants"] = (
649                        sql_query_genotype_df.to_dict(orient="index")
650                    )
651
652            stats["Samples"] = samples
653            stats["Infos"]["Number of samples"] = nb_of_samples
654
655        # #
656        # if "FORMAT" in self.get_header_columns() and "DP" in header_formats_list:
657        #     stats["Infos"]["Number of samples"] = nb_of_samples
658        # elif nb_of_samples:
659        #     stats["Infos"]["Number of samples"] = "not a VCF format"
660
661        ### INFO and FORMAT fields
662        header_types_df = {}
663        header_types_list = {
664            "List of INFO fields": header_infos,
665            "List of FORMAT fields": header_formats,
666        }
667        i = 0
668        for header_type in header_types_list:
669
670            header_type_infos = header_types_list.get(header_type)
671            header_infos_dict = {}
672
673            for info in header_type_infos:
674
675                i += 1
676                header_infos_dict[i] = {}
677
678                # ID
679                header_infos_dict[i]["id"] = info
680
681                # num
682                genotype_map = {None: ".", -1: "A", -2: "G", -3: "R"}
683                if header_type_infos[info].num in genotype_map.keys():
684                    header_infos_dict[i]["Number"] = genotype_map.get(
685                        header_type_infos[info].num
686                    )
687                else:
688                    header_infos_dict[i]["Number"] = header_type_infos[info].num
689
690                # type
691                if header_type_infos[info].type:
692                    header_infos_dict[i]["Type"] = header_type_infos[info].type
693                else:
694                    header_infos_dict[i]["Type"] = "."
695
696                # desc
697                if header_type_infos[info].desc != None:
698                    header_infos_dict[i]["Description"] = header_type_infos[info].desc
699                else:
700                    header_infos_dict[i]["Description"] = ""
701
702            if len(header_infos_dict):
703                header_types_df[header_type] = pd.DataFrame.from_dict(
704                    header_infos_dict, orient="index"
705                ).to_dict(orient="index")
706
707        # Stats
708        stats["Infos"]["Number of INFO fields"] = len(header_infos_list)
709        stats["Infos"]["Number of FORMAT fields"] = len(header_formats_list)
710        stats["Header"] = header_types_df
711
712        ### QUAL
713        if "QUAL" in self.get_header_columns():
714            sql_query_qual = f"""
715                    SELECT
716                        avg(CAST(QUAL AS INTEGER)) AS Average,
717                        min(CAST(QUAL AS INTEGER)) AS Minimum,
718                        max(CAST(QUAL AS INTEGER)) AS Maximum,
719                        stddev(CAST(QUAL AS INTEGER)) AS StandardDeviation,
720                        median(CAST(QUAL AS INTEGER)) AS Median,
721                        variance(CAST(QUAL AS INTEGER)) AS Variance
722                    FROM {table_variants_from}
723                    WHERE CAST(QUAL AS VARCHAR) NOT IN ('.')
724                    """
725
726            qual = self.conn.execute(sql_query_qual).df().to_dict(orient="index")
727            stats["Quality"] = {"Stats": qual}
728
729        ### SNV and InDel
730
731        sql_query_snv = f"""
732            
733            SELECT Type, count FROM (
734
735                    SELECT
736                        'Total' AS Type,
737                        count(*) AS count
738                    FROM {table_variants_from}
739
740                    UNION
741
742                    SELECT
743                        'MNV' AS Type,
744                        count(*) AS count
745                    FROM {table_variants_from}
746                    WHERE len(REF) > 1 AND len(ALT) > 1
747                    AND len(REF) = len(ALT)
748
749                    UNION
750
751                    SELECT
752                        'InDel' AS Type,
753                        count(*) AS count
754                    FROM {table_variants_from}
755                    WHERE len(REF) > 1 OR len(ALT) > 1
756                    AND len(REF) != len(ALT)
757                    
758                    UNION
759
760                    SELECT
761                        'SNV' AS Type,
762                        count(*) AS count
763                    FROM {table_variants_from}
764                    WHERE len(REF) = 1 AND len(ALT) = 1
765
766                )
767
768            ORDER BY count DESC
769
770                """
771        snv_indel = self.conn.execute(sql_query_snv).df().to_dict(orient="index")
772
773        sql_query_snv_substitution = f"""
774                SELECT
775                    concat(REF, '>', ALT) AS 'Substitution',
776                    count(*) AS count
777                FROM {table_variants_from}
778                WHERE len(REF) = 1 AND len(ALT) = 1
779                GROUP BY REF, ALT
780                ORDER BY count(*) DESC
781                """
782        snv_substitution = (
783            self.conn.execute(sql_query_snv_substitution).df().to_dict(orient="index")
784        )
785        stats["Variants"]["Counts"] = snv_indel
786        stats["Variants"]["Substitutions"] = snv_substitution
787
788        return stats

The get_stats function calculates and returns various statistics of the current object, including information about the input file, variants, samples, header fields, quality, and SNVs/InDels.

Returns

a dictionary containing various statistics of the current object. The dictionary has the following structure:

def stats_to_file(self, file: str = None) -> str:
790    def stats_to_file(self, file: str = None) -> str:
791        """
792        The function `stats_to_file` takes a file name as input, retrieves statistics, serializes them
793        into a JSON object, and writes the JSON object to the specified file.
794
795        :param file: The `file` parameter is a string that represents the file path where the JSON data
796        will be written
797        :type file: str
798        :return: the name of the file that was written to.
799        """
800
801        # Get stats
802        stats = self.get_stats()
803
804        # Serializing json
805        json_object = json.dumps(stats, indent=4)
806
807        # Writing to sample.json
808        with open(file, "w") as outfile:
809            outfile.write(json_object)
810
811        return file

The function stats_to_file takes a file name as input, retrieves statistics, serializes them into a JSON object, and writes the JSON object to the specified file.

Parameters
  • file: The file parameter is a string that represents the file path where the JSON data will be written
Returns

the name of the file that was written to.

def print_stats(self, output_file: str = None, json_file: str = None) -> None:
813    def print_stats(self, output_file: str = None, json_file: str = None) -> None:
814        """
815        The `print_stats` function generates a markdown file and prints the statistics contained in a
816        JSON file in a formatted manner.
817
818        :param output_file: The `output_file` parameter is a string that specifies the path and filename
819        of the output file where the stats will be printed in Markdown format. If no `output_file` is
820        provided, a temporary directory will be created and the stats will be saved in a file named
821        "stats.md" within that
822        :type output_file: str
823        :param json_file: The `json_file` parameter is a string that represents the path to the JSON
824        file where the statistics will be saved. If no value is provided, a temporary directory will be
825        created and a default file name "stats.json" will be used
826        :type json_file: str
827        :return: The function `print_stats` does not return any value. It has a return type annotation
828        of `None`.
829        """
830
831        # Full path
832        output_file = full_path(output_file)
833        json_file = full_path(json_file)
834
835        with tempfile.TemporaryDirectory() as tmpdir:
836
837            # Files
838            if not output_file:
839                output_file = os.path.join(tmpdir, "stats.md")
840            if not json_file:
841                json_file = os.path.join(tmpdir, "stats.json")
842
843            # Create folders
844            if not os.path.exists(os.path.dirname(output_file)):
845                Path(os.path.dirname(output_file)).mkdir(parents=True, exist_ok=True)
846            if not os.path.exists(os.path.dirname(json_file)):
847                Path(os.path.dirname(json_file)).mkdir(parents=True, exist_ok=True)
848
849            # Create stats JSON file
850            stats_file = self.stats_to_file(file=json_file)
851
852            # Print stats file
853            with open(stats_file) as f:
854                stats = yaml.safe_load(f)
855
856            # Output
857            output_title = []
858            output_index = []
859            output = []
860
861            # Title
862            output_title.append("# HOWARD Stats")
863
864            # Index
865            output_index.append("## Index")
866
867            # Process sections
868            for section in stats:
869                infos = stats.get(section)
870                section_link = "#" + section.lower().replace(" ", "-")
871                output.append(f"## {section}")
872                output_index.append(f"- [{section}]({section_link})")
873
874                if len(infos):
875                    for info in infos:
876                        try:
877                            df = pd.DataFrame.from_dict(infos.get(info), orient="index")
878                            is_df = True
879                        except:
880                            try:
881                                df = pd.DataFrame.from_dict(
882                                    json.loads((infos.get(info))), orient="index"
883                                )
884                                is_df = True
885                            except:
886                                is_df = False
887                        if is_df:
888                            output.append(f"### {info}")
889                            info_link = "#" + info.lower().replace(" ", "-")
890                            output_index.append(f"   - [{info}]({info_link})")
891                            output.append(f"{df.to_markdown(index=False)}")
892                        else:
893                            output.append(f"- {info}: {infos.get(info)}")
894                else:
895                    output.append(f"NA")
896
897            # Write stats in markdown file
898            with open(output_file, "w") as fp:
899                for item in output_title:
900                    fp.write("%s\n" % item)
901                for item in output_index:
902                    fp.write("%s\n" % item)
903                for item in output:
904                    fp.write("%s\n" % item)
905
906            # Output stats in markdown
907            print("")
908            print("\n\n".join(output_title))
909            print("")
910            print("\n\n".join(output))
911            print("")
912
913        return None

The print_stats function generates a markdown file and prints the statistics contained in a JSON file in a formatted manner.

Parameters
  • output_file: The output_file parameter is a string that specifies the path and filename of the output file where the stats will be printed in Markdown format. If no output_file is provided, a temporary directory will be created and the stats will be saved in a file named "stats.md" within that
  • json_file: The json_file parameter is a string that represents the path to the JSON file where the statistics will be saved. If no value is provided, a temporary directory will be created and a default file name "stats.json" will be used
Returns

The function print_stats does not return any value. It has a return type annotation of None.

def get_input(self) -> str:
915    def get_input(self) -> str:
916        """
917        It returns the value of the input variable.
918        :return: The input is being returned.
919        """
920        return self.input

It returns the value of the input variable.

Returns

The input is being returned.

def get_input_format(self, input_file: str = None) -> str:
922    def get_input_format(self, input_file: str = None) -> str:
923        """
924        This function returns the format of the input variable, either from the provided input file or
925        by prompting for input.
926
927        :param input_file: The `input_file` parameter in the `get_input_format` method is a string that
928        represents the file path of the input file. If no `input_file` is provided when calling the
929        method, it will default to `None`
930        :type input_file: str
931        :return: The format of the input variable is being returned.
932        """
933
934        if not input_file:
935            input_file = self.get_input()
936        input_format = get_file_format(input_file)
937        return input_format

This function returns the format of the input variable, either from the provided input file or by prompting for input.

Parameters
  • input_file: The input_file parameter in the get_input_format method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None
Returns

The format of the input variable is being returned.

def get_input_compressed(self, input_file: str = None) -> str:
939    def get_input_compressed(self, input_file: str = None) -> str:
940        """
941        The function `get_input_compressed` returns the format of the input variable after compressing
942        it.
943
944        :param input_file: The `input_file` parameter in the `get_input_compressed` method is a string
945        that represents the file path of the input file. If no `input_file` is provided when calling the
946        method, it will default to `None` and the method will then call `self.get_input()` to
947        :type input_file: str
948        :return: The function `get_input_compressed` returns the compressed format of the input
949        variable.
950        """
951
952        if not input_file:
953            input_file = self.get_input()
954        input_compressed = get_file_compressed(input_file)
955        return input_compressed

The function get_input_compressed returns the format of the input variable after compressing it.

Parameters
  • input_file: The input_file parameter in the get_input_compressed method is a string that represents the file path of the input file. If no input_file is provided when calling the method, it will default to None and the method will then call self.get_input() to
Returns

The function get_input_compressed returns the compressed format of the input variable.

def get_output(self) -> str:
957    def get_output(self) -> str:
958        """
959        It returns the output of the neuron.
960        :return: The output of the neural network.
961        """
962
963        return self.output

It returns the value of the output variable.

Returns

The output file path is being returned.

def get_output_format(self, output_file: str = None) -> str:
965    def get_output_format(self, output_file: str = None) -> str:
966        """
967        The function `get_output_format` returns the format of the input variable or the output file if
968        provided.
969
970        :param output_file: The `output_file` parameter in the `get_output_format` method is a string
971        that represents the file path of the output file. If no `output_file` is provided when calling
972        the method, it will default to the output obtained from the `get_output` method of the class
973        instance. The
974        :type output_file: str
975        :return: The format of the input variable is being returned.
976        """
977
978        if not output_file:
979            output_file = self.get_output()
980        output_format = get_file_format(output_file)
981
982        return output_format

The function get_output_format returns the format of the input variable or the output file if provided.

Parameters
  • output_file: The output_file parameter in the get_output_format method is a string that represents the file path of the output file. If no output_file is provided when calling the method, it will default to the output obtained from the get_output method of the class instance. The
Returns

The format of the input variable is being returned.

def get_config(self) -> dict:
984    def get_config(self) -> dict:
985        """
986        It returns the config
987        :return: The config variable is being returned.
988        """
989        return self.config

It returns the config

Returns

The config variable is being returned.

def get_param(self) -> dict:
991    def get_param(self) -> dict:
992        """
993        It returns the param
994        :return: The param variable is being returned.
995        """
996        return self.param

It returns the param

Returns

The param variable is being returned.

def get_connexion_db(self) -> str:
 998    def get_connexion_db(self) -> str:
 999        """
1000        It returns the connexion_db attribute of the object
1001        :return: The connexion_db is being returned.
1002        """
1003        return self.connexion_db

It returns the connexion_db attribute of the object

Returns

The connexion_db is being returned.

def get_prefix(self) -> str:
1005    def get_prefix(self) -> str:
1006        """
1007        It returns the prefix of the object.
1008        :return: The prefix is being returned.
1009        """
1010        return self.prefix

It returns the prefix of the object.

Returns

The prefix is being returned.

def get_table_variants(self, clause: str = 'select') -> str:
1012    def get_table_variants(self, clause: str = "select") -> str:
1013        """
1014        This function returns the table_variants attribute of the object
1015
1016        :param clause: the type of clause the table will be used. Either "select" or "from" (optional),
1017        defaults to select (optional)
1018        :return: The table_variants attribute of the object.
1019        """
1020
1021        # Access
1022        access = self.get_config().get("access", None)
1023
1024        # Clauses "select", "where", "update"
1025        if clause in ["select", "where", "update"]:
1026            table_variants = self.table_variants
1027        # Clause "from"
1028        elif clause in ["from"]:
1029            # For Read Only
1030            if self.get_input_format() in ["parquet"] and access in ["RO"]:
1031                input_file = self.get_input()
1032                table_variants = f"'{input_file}' as variants"
1033            # For Read Write
1034            else:
1035                table_variants = f"{self.table_variants} as variants"
1036        else:
1037            table_variants = self.table_variants
1038        return table_variants

This function returns the table_variants attribute of the object

Parameters
  • clause: the type of clause the table will be used. Either "select" or "from" (optional), defaults to select (optional)
Returns

The table_variants attribute of the object.

def get_tmp_dir(self) -> str:
1040    def get_tmp_dir(self) -> str:
1041        """
1042        The function `get_tmp_dir` returns the temporary directory path based on configuration
1043        parameters or a default path.
1044        :return: The `get_tmp_dir` method is returning the temporary directory path based on the
1045        configuration, parameters, and a default value of "/tmp".
1046        """
1047
1048        return get_tmp(
1049            config=self.get_config(), param=self.get_param(), default_tmp="/tmp"
1050        )

The function get_tmp_dir returns the temporary directory path based on configuration parameters or a default path.

Returns

The get_tmp_dir method is returning the temporary directory path based on the configuration, parameters, and a default value of "/tmp".

def get_connexion_type(self) -> str:
1052    def get_connexion_type(self) -> str:
1053        """
1054        If the connexion type is not in the list of allowed connexion types, raise a ValueError
1055
1056        :return: The connexion type is being returned.
1057        """
1058        return self.get_config().get("connexion_type", "memory")

It returns the connexion type from the configuration, defaulting to "memory".

Returns

The connexion type is being returned.

def get_connexion(self):
1060    def get_connexion(self):
1061        """
1062        It returns the connection object
1063
1064        :return: The connection object.
1065        """
1066        return self.conn

It returns the connection object

Returns

The connection object.

def close_connexion(self) -> None:
1068    def close_connexion(self) -> None:
1069        """
1070        This function closes the connection to the database.
1071        :return: The connection is being closed.
1072        """
1073        return self.conn.close()

This function closes the connection to the database.

Returns

The connection is being closed.

def get_header(self, type: str = 'vcf'):
1075    def get_header(self, type: str = "vcf"):
1076        """
1077        This function returns the header of the VCF file as a list of strings
1078
1079        :param type: the type of header you want to get, defaults to vcf (optional)
1080        :return: The header of the vcf file.
1081        """
1082
1083        if self.header_vcf:
1084            if type == "vcf":
1085                return self.header_vcf
1086            elif type == "list":
1087                return self.header_list
1088        else:
1089            if type == "vcf":
1090                header = vcf.Reader(io.StringIO("\n".join(vcf_required)))
1091                return header
1092            elif type == "list":
1093                return vcf_required

This function returns the header of the VCF file as a list of strings

Parameters
  • type: the type of header you want to get, defaults to vcf (optional)
Returns

The header of the vcf file.

def get_header_length(self, file: str = None) -> int:
1095    def get_header_length(self, file: str = None) -> int:
1096        """
1097        The function `get_header_length` returns the length of the header list, excluding the #CHROM
1098        line.
1099
1100        :param file: The `file` parameter is an optional argument that specifies the path to a VCF
1101        header file. If this argument is provided, the function will read the header from the specified
1102        file and return the length of the header list minus 1 (to exclude the #CHROM line)
1103        :type file: str
1104        :return: the length of the header list, excluding the #CHROM line.
1105        """
1106
1107        if file:
1108            return len(self.read_vcf_header_file(file=file)) - 1
1109        elif self.get_header(type="list"):
1110            return len(self.get_header(type="list")) - 1
1111        else:
1112            return 0

The function get_header_length returns the length of the header list, excluding the #CHROM line.

Parameters
  • file: The file parameter is an optional argument that specifies the path to a VCF header file. If this argument is provided, the function will read the header from the specified file and return the length of the header list minus 1 (to exclude the #CHROM line)
Returns

the length of the header list, excluding the #CHROM line.

def get_header_columns(self) -> str:
1114    def get_header_columns(self) -> str:
1115        """
1116        This function returns the header list of a VCF
1117
1118        :return: The length of the header list.
1119        """
1120        if self.get_header():
1121            return self.get_header(type="list")[-1]
1122        else:
1123            return ""

This function returns the #CHROM columns line of a VCF header

Returns

The last line of the header list, or an empty string when no header is available.

def get_header_columns_as_list(self) -> list:
1125    def get_header_columns_as_list(self) -> list:
1126        """
1127        This function returns the header list of a VCF
1128
1129        :return: The length of the header list.
1130        """
1131        if self.get_header():
1132            return self.get_header_columns().strip().split("\t")
1133        else:
1134            return []

This function returns the column names of a VCF header as a list

Returns

The list of column names from the #CHROM header line, or an empty list when no header is available.

def get_header_columns_as_sql(self) -> str:
1136    def get_header_columns_as_sql(self) -> str:
1137        """
1138        This function retruns header length (without #CHROM line)
1139
1140        :return: The length of the header list.
1141        """
1142        sql_column_list = []
1143        for col in self.get_header_columns_as_list():
1144            sql_column_list.append(f'"{col}"')
1145        return ",".join(sql_column_list)

This function returns the header column names as a comma-separated list of double-quoted SQL identifiers

Returns

The quoted column names joined by commas.

def get_header_sample_list( self, check: bool = False, samples: list = None, samples_force: bool = False) -> list:
1147    def get_header_sample_list(
1148        self, check: bool = False, samples: list = None, samples_force: bool = False
1149    ) -> list:
1150        """
1151        The function `get_header_sample_list` returns a list of samples from a VCF header, with optional
1152        checking and filtering based on input parameters.
1153
1154        :param check: The `check` parameter in the `get_header_sample_list` function is a boolean
1155        parameter that determines whether to check if the samples in the list are properly defined as
1156        genotype columns. If `check` is set to `True`, the function will verify if each sample in the
1157        list is defined as a, defaults to False
1158        :type check: bool (optional)
1159        :param samples: The `samples` parameter in the `get_header_sample_list` function is a list that
1160        allows you to specify a subset of samples from the header. If you provide a list of sample
1161        names, the function will check if each sample is defined in the header. If a sample is not found
1162        in the
1163        :type samples: list
1164        :param samples_force: The `samples_force` parameter in the `get_header_sample_list` function is
1165        a boolean parameter that determines whether to force the function to return the sample list
1166        without checking if the samples are genotype columns. If `samples_force` is set to `True`, the
1167        function will return the sample list without performing, defaults to False
1168        :type samples_force: bool (optional)
1169        :return: The function `get_header_sample_list` returns a list of samples based on the input
1170        parameters and conditions specified in the function.
1171        """
1172
1173        # Init
1174        samples_list = []
1175
1176        if samples is None:
1177            samples_list = self.header_vcf.samples
1178        else:
1179            samples_checked = []
1180            for sample in samples:
1181                if sample in self.header_vcf.samples:
1182                    samples_checked.append(sample)
1183                else:
1184                    log.warning(f"Sample '{sample}' not defined in header")
1185            samples_list = samples_checked
1186
1187            # Force sample list without checking if is_genotype_column
1188            if samples_force:
1189                log.warning(f"Samples {samples_list} not checked if genotypes")
1190                return samples_list
1191
1192        if check:
1193            samples_checked = []
1194            for sample in samples_list:
1195                if self.is_genotype_column(column=sample):
1196                    samples_checked.append(sample)
1197                else:
1198                    log.warning(
1199                        f"Sample '{sample}' not defined as a sample (genotype not well defined)"
1200                    )
1201            samples_list = samples_checked
1202
1203        # Return samples list
1204        return samples_list

The function get_header_sample_list returns a list of samples from a VCF header, with optional checking and filtering based on input parameters.

Parameters
  • check: The check parameter in the get_header_sample_list function is a boolean parameter that determines whether to check if the samples in the list are properly defined as genotype columns. If check is set to True, the function will verify if each sample in the list is defined as a, defaults to False
  • samples: The samples parameter in the get_header_sample_list function is a list that allows you to specify a subset of samples from the header. If you provide a list of sample names, the function will check if each sample is defined in the header. If a sample is not found in the
  • samples_force: The samples_force parameter in the get_header_sample_list function is a boolean parameter that determines whether to force the function to return the sample list without checking if the samples are genotype columns. If samples_force is set to True, the function will return the sample list without performing, defaults to False
Returns

The function get_header_sample_list returns a list of samples based on the input parameters and conditions specified in the function.

def is_genotype_column(self, column: str = None) -> bool:
1206    def is_genotype_column(self, column: str = None) -> bool:
1207        """
1208        This function checks if a given column is a genotype column in a database.
1209
1210        :param column: The `column` parameter in the `is_genotype_column` method is a string that
1211        represents the column name in a database table. This method checks if the specified column is a
1212        genotype column in the database. If a column name is provided, it calls the `is_genotype_column`
1213        method of
1214        :type column: str
1215        :return: The `is_genotype_column` method is returning a boolean value. If the `column` parameter
1216        is not None, it calls the `is_genotype_column` method of the `Database` class with the specified
1217        column name and returns the result. If the `column` parameter is None, it returns False.
1218        """
1219
1220        if column is not None:
1221            return Database(database=self.get_input()).is_genotype_column(column=column)
1222        else:
1223            return False

This function checks if a given column is a genotype column in a database.

Parameters
  • column: The column parameter in the is_genotype_column method is a string that represents the column name in a database table. This method checks if the specified column is a genotype column in the database. If a column name is provided, it calls the is_genotype_column method of
Returns

The is_genotype_column method is returning a boolean value. If the column parameter is not None, it calls the is_genotype_column method of the Database class with the specified column name and returns the result. If the column parameter is None, it returns False.

def get_verbose(self) -> bool:
1225    def get_verbose(self) -> bool:
1226        """
1227        It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't
1228        exist
1229
1230        :return: The value of the key "verbose" in the config dictionary.
1231        """
1232        return self.get_config().get("verbose", False)

It returns the value of the "verbose" key in the config dictionary, or False if the key doesn't exist

Returns

The value of the key "verbose" in the config dictionary.

def get_connexion_format(self) -> str:
1234    def get_connexion_format(self) -> str:
1235        """
1236        It returns the connexion format of the object.
1237        :return: The connexion_format is being returned.
1238        """
1239        connexion_format = self.connexion_format
1240        if connexion_format not in ["duckdb", "sqlite"]:
1241            log.error(f"Unknown connexion format {connexion_format}")
1242            raise ValueError(f"Unknown connexion format {connexion_format}")
1243        else:
1244            return connexion_format

It returns the connexion format of the object.

Returns

The connexion_format is being returned.

def insert_file_to_table( self, file, columns: str, header_len: int = 0, sep: str = '\t', chunksize: int = 1000000) -> None:
    def insert_file_to_table(
        self,
        file,
        columns: str,
        header_len: int = 0,
        sep: str = "\t",
        chunksize: int = 1000000,
    ) -> None:
        """
        Read a delimited file in chunks and insert each chunk into the
        'variants' table, using the database-specific insert path.

        :param file: path of the file to load into the table
        :param columns: comma-separated column names used for the INSERT
        :type columns: str
        :param header_len: number of leading lines to skip (header), defaults to 0
        :type header_len: int (optional)
        :param sep: field separator of the file, defaults to a tab character
        :type sep: str (optional)
        :param chunksize: number of rows read per chunk, defaults to 1000000;
            can be overridden by config["load"]["chunk"]
        :type chunksize: int (optional)
        """

        # Chunk size may be overridden by the configuration; connexion format
        # decides which insert path is taken below
        chunksize = self.get_config().get("load", {}).get("chunk", chunksize)
        connexion_format = self.get_connexion_format()

        log.debug("chunksize: " + str(chunksize))

        # NOTE(review): when chunksize is falsy (0/None) nothing is loaded —
        # presumably intentional as a "disabled" setting; confirm with callers
        if chunksize:
            for chunk in pd.read_csv(
                file, skiprows=header_len, sep=sep, chunksize=chunksize, engine="c"
            ):
                if connexion_format in ["duckdb"]:
                    # The SQL references the local DataFrame 'chunk' by name —
                    # duckdb resolves it via its Python replacement scan
                    sql_insert_into = (
                        f"INSERT INTO variants ({columns}) SELECT {columns} FROM chunk"
                    )
                    self.conn.execute(sql_insert_into)
                elif connexion_format in ["sqlite"]:
                    # pandas issues the INSERTs itself for sqlite connections
                    chunk.to_sql("variants", self.conn, if_exists="append", index=False)

The function reads a file in chunks and inserts each chunk into a table based on the specified database format.

Parameters
  • file: The file parameter is the file that you want to load into a table. It should be the path to the file on your system
  • columns: The columns parameter in the insert_file_to_table function is a string that should contain the names of the columns in the table where the data will be inserted. The column names should be separated by commas within the string. For example, if you have columns named "id", "name
  • header_len: The header_len parameter in the insert_file_to_table function specifies the number of lines to skip at the beginning of the file before reading the actual data. This parameter allows you to skip any header information present in the file before processing the data, defaults to 0
  • sep: The sep parameter in the insert_file_to_table function is used to specify the separator character that is used in the file being read. In this case, the default separator is set to , which represents a tab character. You can change this parameter to a different separator character if, defaults to
  • chunksize: The chunksize parameter specifies the number of rows to read in at a time when processing the file in chunks. In the provided code snippet, the default value for chunksize is set to 1000000. This means that the file will be read in chunks of 1,, defaults to 1000000
def load_data( self, input_file: str = None, drop_variants_table: bool = False, sample_size: int = 20480) -> None:
1300    def load_data(
1301        self,
1302        input_file: str = None,
1303        drop_variants_table: bool = False,
1304        sample_size: int = 20480,
1305    ) -> None:
1306        """
1307        The `load_data` function reads a VCF file and inserts it into a table, with options to drop the
1308        table before loading the data and specify a sample size.
1309
1310        :param input_file: The path to the input file. This is the VCF file that will be loaded into the
1311        table
1312        :type input_file: str
1313        :param drop_variants_table: The `drop_variants_table` parameter is a boolean flag that
1314        determines whether the variants table should be dropped before loading the data. If set to
1315        `True`, the variants table will be dropped. If set to `False` (default), the variants table will
1316        not be dropped, defaults to False
1317        :type drop_variants_table: bool (optional)
1318        :param sample_size: The `sample_size` parameter determines the number of rows to be sampled from
1319        the input file. If it is set to `None`, the default value of 20480 will be used, defaults to
1320        20480
1321        :type sample_size: int (optional)
1322        """
1323
1324        log.info("Loading...")
1325
1326        # change input file
1327        if input_file:
1328            self.set_input(input_file)
1329            self.set_header()
1330
1331        # drop variants table
1332        if drop_variants_table:
1333            self.drop_variants_table()
1334
1335        # get table variants
1336        table_variants = self.get_table_variants()
1337
1338        # Access
1339        access = self.get_config().get("access", None)
1340        log.debug(f"access: {access}")
1341
1342        # Input format and compress
1343        input_format = self.get_input_format()
1344        input_compressed = self.get_input_compressed()
1345        log.debug(f"input_format: {input_format}")
1346        log.debug(f"input_compressed: {input_compressed}")
1347
1348        # input_compressed_format
1349        if input_compressed:
1350            input_compressed_format = "gzip"
1351        else:
1352            input_compressed_format = "none"
1353        log.debug(f"input_compressed_format: {input_compressed_format}")
1354
1355        # Connexion format
1356        connexion_format = self.get_connexion_format()
1357
1358        # Sample size
1359        if not sample_size:
1360            sample_size = -1
1361        log.debug(f"sample_size: {sample_size}")
1362
1363        # Load data
1364        log.debug(f"Load Data from {input_format}")
1365
1366        # DuckDB connexion
1367        if connexion_format in ["duckdb"]:
1368
1369            # Database already exists
1370            if self.input_format in ["db", "duckdb"]:
1371
1372                if connexion_format in ["duckdb"]:
1373                    log.debug(f"Input file format '{self.input_format}' duckDB")
1374                else:
1375                    log.error(
1376                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1377                    )
1378                    raise ValueError(
1379                        f"Input file format '{self.input_format}' not compatilbe with database format '{connexion_format}'"
1380                    )
1381
1382            # Load from existing database format
1383            else:
1384
1385                try:
1386                    # Create Table or View
1387                    database = Database(database=self.input)
1388                    sql_from = database.get_sql_from(sample_size=sample_size)
1389
1390                    if access in ["RO"]:
1391                        sql_load = (
1392                            f"CREATE VIEW {table_variants} AS SELECT * FROM {sql_from}"
1393                        )
1394                    else:
1395                        sql_load = (
1396                            f"CREATE TABLE {table_variants} AS SELECT * FROM {sql_from}"
1397                        )
1398                    self.conn.execute(sql_load)
1399
1400                except:
1401                    # Format not available
1402                    log.error(f"Input file format '{self.input_format}' not available")
1403                    raise ValueError(
1404                        f"Input file format '{self.input_format}' not available"
1405                    )
1406
1407        # SQLite connexion
1408        elif connexion_format in ["sqlite"] and input_format in [
1409            "vcf",
1410            "tsv",
1411            "csv",
1412            "psv",
1413        ]:
1414
1415            # Main structure
1416            structure = {
1417                "#CHROM": "VARCHAR",
1418                "POS": "INTEGER",
1419                "ID": "VARCHAR",
1420                "REF": "VARCHAR",
1421                "ALT": "VARCHAR",
1422                "QUAL": "VARCHAR",
1423                "FILTER": "VARCHAR",
1424                "INFO": "VARCHAR",
1425            }
1426
1427            # Strcuture with samples
1428            structure_complete = structure
1429            if self.get_header_sample_list():
1430                structure["FORMAT"] = "VARCHAR"
1431                for sample in self.get_header_sample_list():
1432                    structure_complete[sample] = "VARCHAR"
1433
1434            # Columns list for create and insert
1435            sql_create_table_columns = []
1436            sql_create_table_columns_list = []
1437            for column in structure_complete:
1438                column_type = structure_complete[column]
1439                sql_create_table_columns.append(
1440                    f'"{column}" {column_type} default NULL'
1441                )
1442                sql_create_table_columns_list.append(f'"{column}"')
1443
1444            # Create database
1445            log.debug(f"Create Table {table_variants}")
1446            sql_create_table_columns_sql = ", ".join(sql_create_table_columns)
1447            sql_create_table_columns_list_sql = ", ".join(sql_create_table_columns_list)
1448            sql_create_table = f"CREATE TABLE IF NOT EXISTS {table_variants} ({sql_create_table_columns_sql})"
1449            self.conn.execute(sql_create_table)
1450
1451            # chunksize define length of file chunk load file
1452            chunksize = 100000
1453
1454            # delimiter
1455            delimiter = file_format_delimiters.get(input_format, "\t")
1456
1457            # Load the input file
1458            with open(self.input, "rt") as input_file:
1459
1460                # Use the appropriate file handler based on the input format
1461                if input_compressed:
1462                    input_file = bgzf.open(self.input, "rt")
1463                if input_format in ["vcf"]:
1464                    header_len = self.get_header_length()
1465                else:
1466                    header_len = 0
1467
1468                # Insert the file contents into a table
1469                self.insert_file_to_table(
1470                    input_file,
1471                    columns=sql_create_table_columns_list_sql,
1472                    header_len=header_len,
1473                    sep=delimiter,
1474                    chunksize=chunksize,
1475                )
1476
1477        else:
1478            log.error(
1479                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1480            )
1481            raise ValueError(
1482                f"Connexion format '{connexion_format}' not available with format '{input_format}'"
1483            )
1484
1485        # Explode INFOS fields into table fields
1486        if self.get_explode_infos():
1487            self.explode_infos(
1488                prefix=self.get_explode_infos_prefix(),
1489                fields=self.get_explode_infos_fields(),
1490                force=True,
1491            )
1492
1493        # Create index after insertion
1494        self.create_indexes()

The load_data function reads a VCF file and inserts it into a table, with options to drop the table before loading the data and specify a sample size.

Parameters
  • input_file: The path to the input file. This is the VCF file that will be loaded into the table
  • drop_variants_table: The drop_variants_table parameter is a boolean flag that determines whether the variants table should be dropped before loading the data. If set to True, the variants table will be dropped. If set to False (default), the variants table will not be dropped, defaults to False
  • sample_size: The sample_size parameter determines the number of rows to be sampled from the input file. If it is set to None, the default value of 20480 will be used, defaults to 20480
def get_explode_infos(self) -> bool:
1496    def get_explode_infos(self) -> bool:
1497        """
1498        The function `get_explode_infos` returns the value of the "explode_infos" parameter, defaulting
1499        to False if it is not set.
1500        :return: The method is returning the value of the "explode_infos" parameter, which is a boolean
1501        value. If the parameter is not present, it will return False.
1502        """
1503
1504        return self.get_param().get("explode", {}).get("explode_infos", False)

The function get_explode_infos returns the value of the "explode_infos" parameter, defaulting to False if it is not set.

Returns

The method is returning the value of the "explode_infos" parameter, which is a boolean value. If the parameter is not present, it will return False.

def get_explode_infos_fields( self, explode_infos_fields: str = None, remove_fields_not_in_header: bool = False) -> list:
1506    def get_explode_infos_fields(
1507        self,
1508        explode_infos_fields: str = None,
1509        remove_fields_not_in_header: bool = False,
1510    ) -> list:
1511        """
1512        The `get_explode_infos_fields` function returns a list of exploded information fields based on
1513        the input parameter `explode_infos_fields`.
1514
1515        :param explode_infos_fields: The `explode_infos_fields` parameter is a string that specifies the
1516        fields to be exploded. It can be set to "ALL" to explode all fields, or it can be a
1517        comma-separated list of field names to explode
1518        :type explode_infos_fields: str
1519        :param remove_fields_not_in_header: The parameter `remove_fields_not_in_header` is a boolean
1520        flag that determines whether to remove fields that are not present in the header. If it is set
1521        to `True`, any field that is not in the header will be excluded from the list of exploded
1522        information fields. If it is set to `, defaults to False
1523        :type remove_fields_not_in_header: bool (optional)
1524        :return: The function `get_explode_infos_fields` returns a list of exploded information fields.
1525        If the `explode_infos_fields` parameter is not provided or is set to None, it returns an empty
1526        list. If the parameter is provided and its value is "ALL", it also returns an empty list.
1527        Otherwise, it returns a list of exploded information fields after removing any spaces and
1528        splitting the string by commas.
1529        """
1530
1531        # If no fields, get it in param
1532        if not explode_infos_fields:
1533            explode_infos_fields = (
1534                self.get_param().get("explode", {}).get("explode_infos_fields", None)
1535            )
1536
1537        # If no fields, defined as all fields in header using keyword
1538        if not explode_infos_fields:
1539            explode_infos_fields = "*"
1540
1541        # If fields list not empty
1542        if explode_infos_fields:
1543
1544            # Input fields list
1545            if isinstance(explode_infos_fields, str):
1546                fields_input = explode_infos_fields.split(",")
1547            elif isinstance(explode_infos_fields, list):
1548                fields_input = explode_infos_fields
1549            else:
1550                fields_input = []
1551
1552            # Fields list without * keyword
1553            fields_without_all = fields_input.copy()
1554            if "*".casefold() in (item.casefold() for item in fields_without_all):
1555                fields_without_all.remove("*")
1556
1557            # Fields in header
1558            fields_in_header = sorted(list(set(self.get_header().infos)))
1559
1560            # Construct list of fields
1561            fields_output = []
1562            for field in fields_input:
1563
1564                # Strip field
1565                field = field.strip()
1566
1567                # format keyword * in regex
1568                if field.upper() in ["*"]:
1569                    field = ".*"
1570
1571                # Find all fields with pattern
1572                r = re.compile(field)
1573                fields_search = sorted(list(filter(r.match, fields_in_header)))
1574
1575                # Remove fields input from search
1576                if field in fields_search:
1577                    fields_search = [field]
1578                elif fields_search != [field]:
1579                    fields_search = sorted(
1580                        list(set(fields_search).difference(fields_input))
1581                    )
1582
1583                # If field is not in header (avoid not well formatted header)
1584                if not fields_search and not remove_fields_not_in_header:
1585                    fields_search = [field]
1586
1587                # Add found fields
1588                for new_field in fields_search:
1589                    # Add field, if not already exists, and if it is in header (if asked)
1590                    if (
1591                        new_field not in fields_output
1592                        and (
1593                            not remove_fields_not_in_header
1594                            or new_field in fields_in_header
1595                        )
1596                        and new_field not in [".*"]
1597                    ):
1598                        fields_output.append(new_field)
1599
1600            return fields_output
1601
1602        else:
1603
1604            return []

The get_explode_infos_fields function returns a list of exploded information fields based on the input parameter explode_infos_fields.

Parameters
  • explode_infos_fields: The explode_infos_fields parameter specifies the fields to be exploded, as a comma-separated string or a list of field names; the keyword "*" selects all INFO fields declared in the header
  • remove_fields_not_in_header: The parameter remove_fields_not_in_header is a boolean flag that determines whether to remove fields that are not present in the header. If it is set to True, any field that is not in the header is excluded from the list of exploded information fields; defaults to False
Returns

The function get_explode_infos_fields returns a list of INFO field names to explode. If the explode_infos_fields parameter is not provided, the value is read from the parameters; if it is still unset, the keyword "*" is used, which selects all INFO fields declared in the header. Entries are stripped of spaces, split on commas, and treated as patterns matched against the header fields; duplicates are removed from the result.

def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1606    def get_explode_infos_prefix(self, explode_infos_prefix: str = None) -> str:
1607        """
1608        The function `get_explode_infos_prefix` returns the value of the `explode_infos_prefix` parameter, or
1609        the value of `self.get_param().get("explode_infos_prefix", None)` if `explode_infos_prefix` is
1610        not provided.
1611
1612        :param explode_infos_prefix: The parameter `explode_infos_prefix` is a string that specifies a
1613        prefix to be used for exploding or expanding information
1614        :type explode_infos_prefix: str
1615        :return: the value of the variable `explode_infos_prefix`.
1616        """
1617
1618        if not explode_infos_prefix:
1619            explode_infos_prefix = (
1620                self.get_param().get("explode", {}).get("explode_infos_prefix", "")
1621            )
1622
1623        return explode_infos_prefix

The function get_explode_infos_prefix returns the value of the explode_infos_prefix parameter, or the value of self.get_param().get("explode_infos_prefix", None) if explode_infos_prefix is not provided.

Parameters
  • explode_infos_prefix: The parameter explode_infos_prefix is a string that specifies a prefix to be used for exploding or expanding information
Returns

the value of the variable explode_infos_prefix.

def add_column( self, table_name, column_name, column_type, default_value=None, drop: bool = False) -> dict:
    def add_column(
        self,
        table_name,
        column_name,
        column_type,
        default_value=None,
        drop: bool = False,
    ) -> dict:
        """
        The `add_column` function adds a column to a SQLite or DuckDB table with a default value if it
        doesn't already exist.

        :param table_name: The name of the table to which you want to add a column
        :param column_name: The parameter "column_name" is the name of the column that you want to add
        to the table
        :param column_type: The `column_type` parameter specifies the data type of the column that you
        want to add to the table. It should be a string that represents the desired data type, such as
        "INTEGER", "TEXT", "REAL", etc
        :param default_value: The `default_value` parameter is an optional parameter that specifies the
        default value for the newly added column. If a default value is provided, it will be assigned to
        the column for any existing rows that do not have a value for that column
        :param drop: The `drop` parameter is a boolean flag that determines whether to drop the column
        if it already exists in the table. If `drop` is set to `True`, the function will drop the
        existing column before adding the new column. If `drop` is set to `False` (default), the
        function returns None without touching the existing column, defaults to False
        :type drop: bool (optional)
        :return: a dict describing the added column (keys "table_name", "column_name", "column_type",
        "default_value") when a brand new column was added, or None when the column already existed
        (including when it was dropped and re-created because `drop=True`).
        """

        # added
        added = False
        dropped = False

        # Check if the column already exists in the table (case-insensitive,
        # using a zero-row SELECT to fetch the column names)
        query = f""" SELECT * FROM {table_name} LIMIT 0 """
        columns = self.get_query_to_df(query).columns.tolist()
        if column_name.upper() in [c.upper() for c in columns]:
            log.debug(
                f"The {column_name} column already exists in the {table_name} table"
            )
            if drop:
                self.drop_column(table_name=table_name, column_name=column_name)
                dropped = True
            else:
                # Column exists and no drop requested: nothing added
                return None
        else:
            log.debug(f"The {column_name} column NOT exists in the {table_name} table")

        # Add column in table
        add_column_query = (
            f""" ALTER TABLE {table_name} ADD COLUMN "{column_name}" {column_type} """
        )
        if default_value is not None:
            add_column_query += f" DEFAULT {default_value}"
        self.execute_query(add_column_query)
        # A dropped-and-recreated column does not count as "added"
        added = not dropped
        log.debug(
            f"The {column_name} column was successfully added to the {table_name} table"
        )

        if added:
            added_column = {
                "table_name": table_name,
                "column_name": column_name,
                "column_type": column_type,
                "default_value": default_value,
            }
        else:
            added_column = None

        return added_column

The add_column function adds a column to a SQLite or DuckDB table with a default value if it doesn't already exist.

Parameters
  • table_name: The name of the table to which you want to add a column
  • column_name: The parameter "column_name" is the name of the column that you want to add to the table
  • column_type: The column_type parameter specifies the data type of the column that you want to add to the table. It should be a string that represents the desired data type, such as "INTEGER", "TEXT", "REAL", etc
  • default_value: The default_value parameter is an optional parameter that specifies the default value for the newly added column. If a default value is provided, it will be assigned to the column for any existing rows that do not have a value for that column
  • drop: The drop parameter is a boolean flag that determines whether to drop the column if it already exists in the table. If drop is set to True, the function will drop the existing column before adding the new column. If drop is set to False (default),, defaults to False
Returns

a dict describing the added column (table name, column name, type and default value) if a new column was successfully added, or None if the column already existed.

def drop_column( self, column: dict = None, table_name: str = None, column_name: str = None) -> bool:
1697    def drop_column(
1698        self, column: dict = None, table_name: str = None, column_name: str = None
1699    ) -> bool:
1700        """
1701        The `drop_column` function drops a specified column from a given table in a database and returns
1702        True if the column was successfully dropped, and False if the column does not exist in the
1703        table.
1704
1705        :param column: The `column` parameter is a dictionary that contains information about the column
1706        you want to drop. It has two keys:
1707        :type column: dict
1708        :param table_name: The `table_name` parameter is the name of the table from which you want to
1709        drop a column
1710        :type table_name: str
1711        :param column_name: The `column_name` parameter is the name of the column that you want to drop
1712        from the table
1713        :type column_name: str
1714        :return: a boolean value. It returns True if the column was successfully dropped from the table,
1715        and False if the column does not exist in the table.
1716        """
1717
1718        # Find column infos
1719        if column:
1720            if isinstance(column, dict):
1721                table_name = column.get("table_name", None)
1722                column_name = column.get("column_name", None)
1723            elif isinstance(column, str):
1724                table_name = self.get_table_variants()
1725                column_name = column
1726            else:
1727                table_name = None
1728                column_name = None
1729
1730        if not table_name and not column_name:
1731            return False
1732
1733        # Removed
1734        removed = False
1735
1736        # Check if the column already exists in the table
1737        query = f""" SELECT * FROM {table_name} LIMIT 0 """
1738        columns = self.get_query_to_df(query).columns.tolist()
1739        if column_name in columns:
1740            log.debug(f"The {column_name} column exists in the {table_name} table")
1741        else:
1742            log.debug(f"The {column_name} column NOT exists in the {table_name} table")
1743            return False
1744
1745        # Add column in table # ALTER TABLE integers DROP k
1746        add_column_query = f""" ALTER TABLE {table_name} DROP "{column_name}" """
1747        self.execute_query(add_column_query)
1748        removed = True
1749        log.debug(
1750            f"The {column_name} column was successfully dropped to the {table_name} table"
1751        )
1752
1753        return removed

The drop_column function drops a specified column from a given table in a database and returns True if the column was successfully dropped, and False if the column does not exist in the table.

Parameters
  • column: The column parameter is a dictionary that contains information about the column you want to drop. It has two keys: "table_name" and "column_name"
  • table_name: The table_name parameter is the name of the table from which you want to drop a column
  • column_name: The column_name parameter is the name of the column that you want to drop from the table
Returns

a boolean value. It returns True if the column was successfully dropped from the table, and False if the column does not exist in the table.

def explode_infos( self, prefix: str = None, create_index: bool = False, fields: list = None, force: bool = False, proccess_all_fields_together: bool = False, table: str = None) -> list:
1755    def explode_infos(
1756        self,
1757        prefix: str = None,
1758        create_index: bool = False,
1759        fields: list = None,
1760        force: bool = False,
1761        proccess_all_fields_together: bool = False,
1762        table: str = None,
1763    ) -> list:
1764        """
1765        The `explode_infos` function in Python takes a VCF file and explodes the INFO fields into
1766        individual columns, returning a list of added columns.
1767
1768        :param prefix: The `prefix` parameter is a string that is used as a prefix for the exploded INFO
1769        fields. If the `prefix` is not provided or is set to `None`, the function will use the value of
1770        `self.get_explode_infos_prefix()` as the prefix
1771        :type prefix: str
1772        :param create_index: The `create_index` parameter is a boolean flag that specifies whether to
1773        create indexes on the exploded INFO fields. If set to `True`, indexes will be created; if set to
1774        `False`, indexes will not be created. The default value is `False`, defaults to False
1775        :type create_index: bool (optional)
1776        :param fields: The `fields` parameter in the `explode_infos` function is a list of INFO fields
1777        that you want to explode into individual columns. If this parameter is not provided, all INFO
1778        fields will be exploded. You can specify the INFO fields you want to explode by passing them as
1779        a list to the `
1780        :type fields: list
1781        :param force: The `force` parameter in the `explode_infos` function is a boolean flag that
1782        determines whether to drop and recreate a column if it already exists in the table. If `force`
1783        is set to `True`, the column will be dropped and recreated. If `force` is set to `False,
1784        defaults to False
1785        :type force: bool (optional)
1786        :param proccess_all_fields_together: The `proccess_all_fields_together` parameter is a boolean
1787        flag that determines whether to process all the INFO fields together or individually. If set to
1788        `True`, all the INFO fields will be processed together. If set to `False`, each INFO field will
1789        be processed individually. The default value is, defaults to False
1790        :type proccess_all_fields_together: bool (optional)
1791        :param table: The `table` parameter in the `explode_infos` function is used to specify the name
1792        of the table where the exploded INFO fields will be added as individual columns. If you provide
1793        a value for the `table` parameter, the function will use that table name. If the `table`
1794        parameter is
1795        :type table: str
1796        :return: The `explode_infos` function returns a list of added columns.
1797        """
1798
1799        # drop indexes
1800        self.drop_indexes()
1801
1802        # connexion format
1803        connexion_format = self.get_connexion_format()
1804
1805        # Access
1806        access = self.get_config().get("access", None)
1807
1808        # Added columns
1809        added_columns = []
1810
1811        if access not in ["RO"]:
1812
1813            # prefix
1814            if prefix in [None, True] or not isinstance(prefix, str):
1815                if self.get_explode_infos_prefix() not in [None, True]:
1816                    prefix = self.get_explode_infos_prefix()
1817                else:
1818                    prefix = "INFO/"
1819
1820            # table variants
1821            if table is not None:
1822                table_variants = table
1823            else:
1824                table_variants = self.get_table_variants(clause="select")
1825
1826            # extra infos
1827            try:
1828                extra_infos = self.get_extra_infos()
1829            except:
1830                extra_infos = []
1831
1832            # Header infos
1833            header_infos = self.get_header().infos
1834
1835            log.debug(
1836                f"Explode INFO fields - ADD [{len(header_infos)}] annotations fields"
1837            )
1838
1839            sql_info_alter_table_array = []
1840
1841            # Info fields to check
1842            fields_list = list(header_infos)
1843            if fields:
1844                fields_list += fields
1845            fields_list = set(fields_list)
1846
1847            # If no fields
1848            if not fields:
1849                fields = []
1850
1851            # Translate fields if patterns
1852            fields = self.get_explode_infos_fields(explode_infos_fields=fields)
1853
1854            for info in fields:
1855
1856                info_id_sql = prefix + info
1857
1858                if (
1859                    info in fields_list
1860                    or prefix + info in fields_list
1861                    or info in extra_infos
1862                ):
1863
1864                    log.debug(f"Explode INFO fields - ADD '{info}' annotations fields")
1865
1866                    if info in header_infos:
1867                        info_type = header_infos[info].type
1868                        info_num = header_infos[info].num
1869                    else:
1870                        info_type = "String"
1871                        info_num = 0
1872
1873                    type_sql = self.code_type_map_to_sql.get(info_type, "VARCHAR")
1874                    if info_num != 1:
1875                        type_sql = "VARCHAR"
1876
1877                    # Add field
1878                    added_column = self.add_column(
1879                        table_name=table_variants,
1880                        column_name=info_id_sql,
1881                        column_type=type_sql,
1882                        default_value="null",
1883                        drop=force,
1884                    )
1885
1886                    if added_column:
1887                        added_columns.append(added_column)
1888
1889                    if added_column or force:
1890
1891                        # add field to index
1892                        self.index_additionnal_fields.append(info_id_sql)
1893
1894                        # Update field array
1895                        if connexion_format in ["duckdb"]:
1896                            update_info_field = f"""
1897                            "{info_id_sql}" =
1898                                CASE
1899                                    WHEN REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1) IN ('','.') THEN NULL
1900                                    ELSE REGEXP_EXTRACT(concat(';', INFO), ';{info}=([^;]*)',1)
1901                                END
1902                            """
1903                        elif connexion_format in ["sqlite"]:
1904                            update_info_field = f"""
1905                                "{info_id_sql}" =
1906                                    CASE
1907                                        WHEN instr(INFO, '{info}=') = 0 THEN NULL
1908                                        WHEN instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';') = 0 THEN substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1)
1909                                        ELSE substr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')+1, instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}),';')-instr(substr(INFO, instr(INFO, '{info}=')+{len(info)+1}), '=')-1)
1910                                    END
1911                            """
1912
1913                        sql_info_alter_table_array.append(update_info_field)
1914
1915            if sql_info_alter_table_array:
1916
1917                # By chromosomes
1918                try:
1919                    chromosomes_list = list(
1920                        self.get_query_to_df(
1921                            f""" SELECT "#CHROM" FROM {table_variants} GROUP BY "#CHROM" """
1922                        )["#CHROM"]
1923                    )
1924                except:
1925                    chromosomes_list = [None]
1926
1927                for chrom in chromosomes_list:
1928                    log.debug(f"Explode INFO fields - Chromosome {chrom}...")
1929
1930                    # Where clause
1931                    where_clause = ""
1932                    if chrom and len(chromosomes_list) > 1:
1933                        where_clause = f""" WHERE "#CHROM" = '{chrom}' """
1934
1935                    # Update table
1936                    if proccess_all_fields_together:
1937                        sql_info_alter_table_array_join = ", ".join(
1938                            sql_info_alter_table_array
1939                        )
1940                        if sql_info_alter_table_array_join:
1941                            sql_info_alter_table = f"""
1942                                UPDATE {table_variants}
1943                                SET {sql_info_alter_table_array_join}
1944                                {where_clause}
1945                                """
1946                            log.debug(
1947                                f"Explode INFO fields - Explode all {len(sql_info_alter_table_array)} fields..."
1948                            )
1949                            # log.debug(sql_info_alter_table)
1950                            self.conn.execute(sql_info_alter_table)
1951                    else:
1952                        sql_info_alter_num = 0
1953                        for sql_info_alter in sql_info_alter_table_array:
1954                            sql_info_alter_num += 1
1955                            sql_info_alter_table = f"""
1956                                UPDATE {table_variants}
1957                                SET {sql_info_alter}
1958                                {where_clause}
1959                                """
1960                            log.debug(
1961                                f"Explode INFO fields - Explode field {sql_info_alter_num}/{len(sql_info_alter_table_array)}..."
1962                            )
1963                            # log.debug(sql_info_alter_table)
1964                            self.conn.execute(sql_info_alter_table)
1965
1966        # create indexes
1967        if create_index:
1968            self.create_indexes()
1969
1970        return added_columns

The explode_infos function in Python takes a VCF file and explodes the INFO fields into individual columns, returning a list of added columns.

Parameters
  • prefix: The prefix parameter is a string that is used as a prefix for the exploded INFO fields. If the prefix is not provided or is set to None, the function will use the value of self.get_explode_infos_prefix() as the prefix
  • create_index: The create_index parameter is a boolean flag that specifies whether to create indexes on the exploded INFO fields. If set to True, indexes will be created; if set to False, indexes will not be created. The default value is False, defaults to False
  • fields: The fields parameter in the explode_infos function is a list of INFO fields that you want to explode into individual columns. If this parameter is not provided, all INFO fields will be exploded. You can specify the INFO fields to explode by passing their names as a list.
  • force: The force parameter in the explode_infos function is a boolean flag that determines whether to drop and recreate a column if it already exists in the table. If force is set to True, the column will be dropped and recreated; if set to False, existing columns are left untouched. Defaults to False.
  • proccess_all_fields_together: The proccess_all_fields_together parameter is a boolean flag that determines whether to process all the INFO fields together or individually. If set to True, all the INFO fields will be processed together in a single UPDATE statement. If set to False, each INFO field will be processed individually. Defaults to False.
  • table: The table parameter in the explode_infos function is used to specify the name of the table where the exploded INFO fields will be added as individual columns. If you provide a value for the table parameter, the function will use that table name. If the table parameter is not provided, the default variants table is used.
Returns

The explode_infos function returns a list of added columns.

def create_indexes(self) -> None:
1972    def create_indexes(self) -> None:
1973        """
1974        Create indexes on the table after insertion
1975        """
1976
1977        # Access
1978        access = self.get_config().get("access", None)
1979
1980        # get table variants
1981        table_variants = self.get_table_variants("FROM")
1982
1983        if self.get_indexing() and access not in ["RO"]:
1984            # Create index
1985            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()} ON {table_variants} ("#CHROM", "POS", "REF", "ALT")'
1986            self.conn.execute(sql_create_table_index)
1987            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_chrom ON {table_variants} ("#CHROM")'
1988            self.conn.execute(sql_create_table_index)
1989            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_pos ON {table_variants} ("POS")'
1990            self.conn.execute(sql_create_table_index)
1991            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_ref ON {table_variants} ( "REF")'
1992            self.conn.execute(sql_create_table_index)
1993            sql_create_table_index = f'CREATE INDEX IF NOT EXISTS idx_{self.get_table_variants()}_alt ON {table_variants} ("ALT")'
1994            self.conn.execute(sql_create_table_index)
1995            for field in self.index_additionnal_fields:
1996                sql_create_table_index = f""" CREATE INDEX IF NOT EXISTS "idx_{self.get_table_variants()}_{field}" ON {table_variants} ("{field}") """
1997                self.conn.execute(sql_create_table_index)

Create indexes on the table after insertion

def drop_indexes(self) -> None:
1999    def drop_indexes(self) -> None:
2000        """
2001        Create indexes on the table after insertion
2002        """
2003
2004        # Access
2005        access = self.get_config().get("access", None)
2006
2007        # get table variants
2008        table_variants = self.get_table_variants("FROM")
2009
2010        # Get database format
2011        connexion_format = self.get_connexion_format()
2012
2013        if access not in ["RO"]:
2014            if connexion_format in ["duckdb"]:
2015                sql_list_indexes = f"SELECT index_name FROM duckdb_indexes WHERE table_name='{table_variants}'"
2016            elif connexion_format in ["sqlite"]:
2017                sql_list_indexes = f"SELECT name FROM sqlite_master WHERE type='index' AND tbl_name='{table_variants}';"
2018
2019            list_indexes = self.conn.execute(sql_list_indexes)
2020            index_names = [row[0] for row in list_indexes.fetchall()]
2021            for index in index_names:
2022                sql_drop_table_index = f""" DROP INDEX IF EXISTS "{index}" """
2023                self.conn.execute(sql_drop_table_index)

Drop all indexes of the variants table.

def read_vcf_header(self, f) -> list:
2025    def read_vcf_header(self, f) -> list:
2026        """
2027        It reads the header of a VCF file and returns a list of the header lines
2028
2029        :param f: the file object
2030        :return: The header lines of the VCF file.
2031        """
2032
2033        header_list = []
2034        for line in f:
2035            header_list.append(line)
2036            if line.startswith("#CHROM"):
2037                break
2038        return header_list

It reads the header of a VCF file and returns a list of the header lines

Parameters
  • f: the file object
Returns

The header lines of the VCF file.

def read_vcf_header_file(self, file: str = None) -> list:
2040    def read_vcf_header_file(self, file: str = None) -> list:
2041        """
2042        The `read_vcf_header_file` function reads the header of a VCF file, handling both compressed and
2043        uncompressed files.
2044
2045        :param file: The `file` parameter is a string that represents the path to the VCF header file
2046        that you want to read. It is an optional parameter, so if you don't provide a value, it will
2047        default to `None`
2048        :type file: str
2049        :return: The function `read_vcf_header_file` returns a list.
2050        """
2051
2052        if self.get_input_compressed(input_file=file):
2053            with bgzf.open(file, "rt") as f:
2054                return self.read_vcf_header(f=f)
2055        else:
2056            with open(file, "rt") as f:
2057                return self.read_vcf_header(f=f)

The read_vcf_header_file function reads the header of a VCF file, handling both compressed and uncompressed files.

Parameters
  • file: The file parameter is a string that represents the path to the VCF header file that you want to read. It is an optional parameter, so if you don't provide a value, it will default to None
Returns

The function read_vcf_header_file returns a list.

def execute_query(self, query: str):
2059    def execute_query(self, query: str):
2060        """
2061        It takes a query as an argument, executes it, and returns the results
2062
2063        :param query: The query to be executed
2064        :return: The result of the query is being returned.
2065        """
2066        if query:
2067            return self.conn.execute(query)  # .fetchall()
2068        else:
2069            return None

It takes a query as an argument, executes it, and returns the results

Parameters
  • query: The query to be executed
Returns

The result of the query is being returned.

def export_output( self, output_file: str | None = None, output_header: str | None = None, export_header: bool = True, query: str | None = None, parquet_partitions: list | None = None, chunk_size: int | None = None, threads: int | None = None, sort: bool = False, index: bool = False, order_by: str | None = None) -> bool:
2071    def export_output(
2072        self,
2073        output_file: str | None = None,
2074        output_header: str | None = None,
2075        export_header: bool = True,
2076        query: str | None = None,
2077        parquet_partitions: list | None = None,
2078        chunk_size: int | None = None,
2079        threads: int | None = None,
2080        sort: bool = False,
2081        index: bool = False,
2082        order_by: str | None = None,
2083    ) -> bool:
2084        """
2085        The `export_output` function exports data from a VCF file to a specified output file in various
2086        formats, including VCF, CSV, TSV, PSV, and Parquet.
2087
2088        :param output_file: The `output_file` parameter is a string that specifies the name of the
2089        output file to be generated by the function. This is where the exported data will be saved
2090        :type output_file: str
2091        :param output_header: The `output_header` parameter is a string that specifies the name of the
2092        file where the header of the VCF file will be exported. If this parameter is not provided, the
2093        header will be exported to a file with the same name as the `output_file` parameter, but with
2094        the extension "
2095        :type output_header: str
2096        :param export_header: The `export_header` parameter is a boolean flag that determines whether
2097        the header of a VCF file should be exported to a separate file or not. If `export_header` is
2098        True, the header will be exported to a file. If `export_header` is False, the header will not
2099        be, defaults to True, if output format is not VCF
2100        :type export_header: bool (optional)
2101        :param query: The `query` parameter is an optional SQL query that can be used to filter and
2102        select specific data from the VCF file before exporting it. If provided, only the data that
2103        matches the query will be exported
2104        :type query: str
2105        :param parquet_partitions: The `parquet_partitions` parameter is a list that specifies the
2106        columns to be used for partitioning the Parquet file during export. Partitioning is a way to
2107        organize data in a hierarchical directory structure based on the values of one or more columns.
2108        This can improve query performance when working with large datasets
2109        :type parquet_partitions: list
2110        :param chunk_size: The `chunk_size` parameter specifies the number of
2111        records in batch when exporting data in Parquet format. This parameter is used for
2112        partitioning the Parquet file into multiple files.
2113        :type chunk_size: int
2114        :param threads: The `threads` parameter is an optional parameter that specifies the number of
2115        threads to be used during the export process. It determines the level of parallelism and can
2116        improve the performance of the export operation. If not provided, the function will use the
2117        default number of threads
2118        :type threads: int
2119        :param sort: The `sort` parameter is a boolean flag that determines whether the output file
2120        should be sorted or not. If `sort` is set to `True`, the output file will be sorted based on the
2121        genomic coordinates of the variants. By default, the value of `sort` is `False`, defaults to
2122        False
2123        :type sort: bool (optional)
2124        :param index: The `index` parameter is a boolean flag that determines whether an index should be
2125        created on the output file. If `index` is True, an index will be created. If `index` is False,
2126        no index will be created. The default value is False, defaults to False
2127        :type index: bool (optional)
2128        :param order_by: The `order_by` parameter is a string that specifies the column(s) to use for
2129        sorting the output file. This parameter is only applicable when exporting data in VCF format
2130        :type order_by: str
2131        :return: a boolean value. It checks if the output file exists and returns True if it does, or
2132        None if it doesn't.
2133        """
2134
2135        # Log
2136        log.info("Exporting...")
2137
2138        # Full path
2139        output_file = full_path(output_file)
2140        output_header = full_path(output_header)
2141
2142        # Config
2143        config = self.get_config()
2144
2145        # Param
2146        param = self.get_param()
2147
2148        # Tmp files to remove
2149        tmp_to_remove = []
2150
2151        # If no output, get it
2152        if not output_file:
2153            output_file = self.get_output()
2154
2155        # If not threads
2156        if not threads:
2157            threads = self.get_threads()
2158
2159        # Auto header name with extension
2160        if export_header or output_header:
2161            if not output_header:
2162                output_header = f"{output_file}.hdr"
2163            # Export header
2164            self.export_header(output_file=output_file)
2165
2166        # Switch off export header if VCF output
2167        output_file_type = get_file_format(output_file)
2168        if output_file_type in ["vcf"]:
2169            export_header = False
2170            tmp_to_remove.append(output_header)
2171
2172        # Chunk size
2173        if not chunk_size:
2174            chunk_size = config.get("chunk_size", None)
2175
2176        # Parquet partition
2177        if not parquet_partitions:
2178            parquet_partitions = param.get("export", {}).get("parquet_partitions", None)
2179        if parquet_partitions and isinstance(parquet_partitions, str):
2180            parquet_partitions = parquet_partitions.split(",")
2181
2182        # Order by
2183        if not order_by:
2184            order_by = param.get("export", {}).get("order_by", "")
2185
2186        # Header in output
2187        header_in_output = param.get("export", {}).get("include_header", False)
2188
2189        # Database
2190        database_source = self.get_connexion()
2191
2192        # Connexion format
2193        connexion_format = self.get_connexion_format()
2194
2195        # Explode infos
2196        if self.get_explode_infos():
2197            self.explode_infos(
2198                prefix=self.get_explode_infos_prefix(),
2199                fields=self.get_explode_infos_fields(),
2200                force=False,
2201            )
2202
2203        # if connexion_format in ["sqlite"] or query:
2204        if connexion_format in ["sqlite"]:
2205
2206            # Export in Parquet
2207            random_tmp = "".join(
2208                random.choice(string.ascii_lowercase) for i in range(10)
2209            )
2210            database_source = f"""{output_file}.{random_tmp}.database_export.parquet"""
2211            tmp_to_remove.append(database_source)
2212
2213            # Table Variants
2214            table_variants = self.get_table_variants()
2215
2216            # Create export query
2217            sql_query_export_subquery = f"""
2218                SELECT * FROM {table_variants}
2219                """
2220
2221            # Write source file
2222            fp.write(database_source, self.get_query_to_df(sql_query_export_subquery))
2223
2224        # Create database
2225        database = Database(
2226            database=database_source,
2227            table="variants",
2228            header_file=output_header,
2229            conn_config=self.get_connexion_config(),
2230        )
2231
2232        # Existing colomns header
2233        existing_columns_header = database.get_header_columns_from_database()
2234
2235        # Sample list
2236        get_samples = self.get_samples()
2237        get_samples_check = self.get_samples_check()
2238        samples_force = get_samples is not None
2239        sample_list = self.get_header_sample_list(
2240            check=get_samples_check, samples=get_samples, samples_force=samples_force
2241        )
2242
2243        # Export file
2244        database.export(
2245            output_database=output_file,
2246            output_header=output_header,
2247            existing_columns_header=existing_columns_header,
2248            parquet_partitions=parquet_partitions,
2249            chunk_size=chunk_size,
2250            threads=threads,
2251            sort=sort,
2252            index=index,
2253            header_in_output=header_in_output,
2254            order_by=order_by,
2255            query=query,
2256            export_header=export_header,
2257            sample_list=sample_list,
2258        )
2259
2260        # Remove
2261        remove_if_exists(tmp_to_remove)
2262
2263        return (os.path.exists(output_file) or None) and (
2264            os.path.exists(output_file) or None
2265        )

The export_output function exports data from a VCF file to a specified output file in various formats, including VCF, CSV, TSV, PSV, and Parquet.

Parameters
  • output_file: The output_file parameter is a string that specifies the name of the output file to be generated by the function. This is where the exported data will be saved
  • output_header: The output_header parameter is a string that specifies the name of the file where the header of the VCF file will be exported. If this parameter is not provided, the header will be exported to a file with the same name as the output_file parameter, but with the extension ".hdr".
  • export_header: The export_header parameter is a boolean flag that determines whether the header of a VCF file should be exported to a separate file or not. If export_header is True, the header will be exported to a file. If export_header is False, the header will not be, defaults to True, if output format is not VCF
  • query: The query parameter is an optional SQL query that can be used to filter and select specific data from the VCF file before exporting it. If provided, only the data that matches the query will be exported
  • parquet_partitions: The parquet_partitions parameter is a list that specifies the columns to be used for partitioning the Parquet file during export. Partitioning is a way to organize data in a hierarchical directory structure based on the values of one or more columns. This can improve query performance when working with large datasets
  • chunk_size: The chunk_size parameter specifies the number of records in batch when exporting data in Parquet format. This parameter is used for partitioning the Parquet file into multiple files.
  • threads: The threads parameter is an optional parameter that specifies the number of threads to be used during the export process. It determines the level of parallelism and can improve the performance of the export operation. If not provided, the function will use the default number of threads
  • sort: The sort parameter is a boolean flag that determines whether the output file should be sorted or not. If sort is set to True, the output file will be sorted based on the genomic coordinates of the variants. By default, the value of sort is False, defaults to False
  • index: The index parameter is a boolean flag that determines whether an index should be created on the output file. If index is True, an index will be created. If index is False, no index will be created. The default value is False, defaults to False
  • order_by: The order_by parameter is a string that specifies the column(s) to use for sorting the output file. This parameter is only applicable when exporting data in VCF format
Returns

a boolean value. It checks if the output file exists and returns True if it does, or None if it doesn't.

def get_extra_infos(self, table: str = None) -> list:
2267    def get_extra_infos(self, table: str = None) -> list:
2268        """
2269        The `get_extra_infos` function returns a list of columns that are in a specified table but not
2270        in the header.
2271
2272        :param table: The `table` parameter in the `get_extra_infos` function is used to specify the
2273        name of the table from which you want to retrieve the extra columns that are not present in the
2274        header. If the `table` parameter is not provided when calling the function, it will default to
2275        using the variants
2276        :type table: str
2277        :return: A list of columns that are in the specified table but not in the header of the table.
2278        """
2279
2280        header_columns = []
2281
2282        if not table:
2283            table = self.get_table_variants(clause="from")
2284            header_columns = self.get_header_columns()
2285
2286        # Check all columns in the database
2287        query = f""" SELECT * FROM {table} LIMIT 1 """
2288        log.debug(f"query {query}")
2289        table_columns = self.get_query_to_df(query).columns.tolist()
2290        extra_columns = []
2291
2292        # Construct extra infos (not in header)
2293        for column in table_columns:
2294            if column not in header_columns:
2295                extra_columns.append(column)
2296
2297        return extra_columns

The get_extra_infos function returns a list of columns that are in a specified table but not in the header.

Parameters
  • table: The table parameter in the get_extra_infos function is used to specify the name of the table from which you want to retrieve the extra columns that are not present in the header. If the table parameter is not provided when calling the function, it will default to using the variants
Returns

A list of columns that are in the specified table but not in the header of the table.

def get_extra_infos_sql(self, table: str = None) -> str:
2299    def get_extra_infos_sql(self, table: str = None) -> str:
2300        """
2301        It returns a string of the extra infos, separated by commas, and each extra info is surrounded
2302        by double quotes
2303
2304        :param table: The name of the table to get the extra infos from. If None, the default table is
2305        used
2306        :type table: str
2307        :return: A string of the extra infos
2308        """
2309
2310        return ", ".join(
2311            ['"' + str(elem) + '"' for elem in self.get_extra_infos(table=table)]
2312        )

It returns a string of the extra infos, separated by commas, and each extra info is surrounded by double quotes

Parameters
  • table: The name of the table to get the extra infos from. If None, the default table is used
Returns

A string of the extra infos

def export_header( self, header_name: str = None, output_file: str = None, output_file_ext: str = '.hdr', clean_header: bool = True, remove_chrom_line: bool = False) -> str:
2314    def export_header(
2315        self,
2316        header_name: str = None,
2317        output_file: str = None,
2318        output_file_ext: str = ".hdr",
2319        clean_header: bool = True,
2320        remove_chrom_line: bool = False,
2321    ) -> str:
2322        """
2323        The `export_header` function takes a VCF file, extracts the header, modifies it according to
2324        specified options, and writes it to a new file.
2325
2326        :param header_name: The `header_name` parameter is the name of the header file to be created. If
2327        this parameter is not specified, the header will be written to the output file
2328        :type header_name: str
2329        :param output_file: The `output_file` parameter in the `export_header` function is used to
2330        specify the name of the output file where the header will be written. If this parameter is not
2331        provided, the header will be written to a temporary file
2332        :type output_file: str
2333        :param output_file_ext: The `output_file_ext` parameter in the `export_header` function is a
2334        string that represents the extension of the output header file. By default, it is set to ".hdr"
2335        if not specified by the user. This extension will be appended to the `output_file` name to
2336        create the final, defaults to .hdr
2337        :type output_file_ext: str (optional)
2338        :param clean_header: The `clean_header` parameter in the `export_header` function is a boolean
2339        flag that determines whether the header should be cleaned or not. When `clean_header` is set to
2340        `True`, the function will clean the header by modifying certain lines based on a specific
2341        pattern. If `clean_header`, defaults to True
2342        :type clean_header: bool (optional)
2343        :param remove_chrom_line: The `remove_chrom_line` parameter in the `export_header` function is a
2344        boolean flag that determines whether the #CHROM line should be removed from the header before
2345        writing it to the output file. If set to `True`, the #CHROM line will be removed; if set to `,
2346        defaults to False
2347        :type remove_chrom_line: bool (optional)
2348        :return: The function `export_header` returns the name of the temporary header file that is
2349        created.
2350        """
2351
2352        if not header_name and not output_file:
2353            output_file = self.get_output()
2354
2355        if self.get_header():
2356
2357            # Get header object
2358            header_obj = self.get_header()
2359
2360            # Create database
2361            db_for_header = Database(database=self.get_input())
2362
2363            # Get real columns in the file
2364            db_header_columns = db_for_header.get_columns()
2365
2366            with tempfile.TemporaryDirectory() as tmpdir:
2367
2368                # Write header file
2369                header_file_tmp = os.path.join(tmpdir, "header")
2370                f = open(header_file_tmp, "w")
2371                vcf.Writer(f, header_obj)
2372                f.close()
2373
2374                # Replace #CHROM line with rel columns
2375                header_list = db_for_header.read_header_file(
2376                    header_file=header_file_tmp
2377                )
2378                header_list[-1] = "\t".join(db_header_columns)
2379
2380                # Remove CHROM line
2381                if remove_chrom_line:
2382                    header_list.pop()
2383
2384                # Clean header
2385                if clean_header:
2386                    header_list_clean = []
2387                    for head in header_list:
2388                        # Clean head for malformed header
2389                        head_clean = head
2390                        head_clean = re.subn(
2391                            "##FORMAT=<ID=(.*),Number=(.*),Type=Flag",
2392                            r"##FORMAT=<ID=\1,Number=\2,Type=String",
2393                            head_clean,
2394                            2,
2395                        )[0]
2396                        # Write header
2397                        header_list_clean.append(head_clean)
2398                    header_list = header_list_clean
2399
2400            tmp_header_name = output_file + output_file_ext
2401
2402            f = open(tmp_header_name, "w")
2403            for line in header_list:
2404                f.write(line)
2405            f.close()
2406
2407        return tmp_header_name

The export_header function takes a VCF file, extracts the header, modifies it according to specified options, and writes it to a new file.

Parameters
  • header_name: The header_name parameter is the name of the header file to be created. If this parameter is not specified, the header will be written to the output file
  • output_file: The output_file parameter in the export_header function is used to specify the name of the output file where the header will be written. If this parameter is not provided, the header will be written to a temporary file
  • output_file_ext: The output_file_ext parameter in the export_header function is a string that represents the extension of the output header file. By default, it is set to ".hdr" if not specified by the user. This extension will be appended to the output_file name to create the final, defaults to .hdr
  • clean_header: The clean_header parameter in the export_header function is a boolean flag that determines whether the header should be cleaned or not. When clean_header is set to True, the function will clean the header by modifying certain lines based on a specific pattern. If clean_header, defaults to True
  • remove_chrom_line: The remove_chrom_line parameter in the export_header function is a boolean flag that determines whether the #CHROM line should be removed from the header before writing it to the output file. If set to True, the #CHROM line will be removed; if set to False, it is kept. Defaults to False.
Returns

The function export_header returns the name of the temporary header file that is created.

def export_variant_vcf( self, vcf_file, remove_info: bool = False, add_samples: bool = True, list_samples: list = [], where_clause: str = '', index: bool = False, threads: int | None = None) -> bool | None:
2409    def export_variant_vcf(
2410        self,
2411        vcf_file,
2412        remove_info: bool = False,
2413        add_samples: bool = True,
2414        list_samples: list = [],
2415        where_clause: str = "",
2416        index: bool = False,
2417        threads: int | None = None,
2418    ) -> bool | None:
2419        """
2420        The `export_variant_vcf` function exports a VCF file with specified samples, allowing options to
2421        remove INFO field, add samples, and control compression and indexing.
2422
2423        :param vcf_file: The `vcf_file` parameter is the name of the file where the VCF data will be
2424        written to. It is the output file that will contain the filtered VCF data based on the specified
2425        parameters
2426        :param remove_info: The `remove_info` parameter in the `export_variant_vcf` function is a
2427        boolean flag that determines whether to remove the INFO field from the output VCF file. If set
2428        to `True`, the INFO field will be removed. If set to `False`, the INFO field will be included
2429        in, defaults to False
2430        :type remove_info: bool (optional)
2431        :param add_samples: The `add_samples` parameter is a boolean parameter that determines whether
2432        the samples should be added to the VCF file or not. If set to True, the samples will be added.
2433        If set to False, the samples will be removed. The default value is True, defaults to True
2434        :type add_samples: bool (optional)
2435        :param list_samples: The `list_samples` parameter is a list of samples that you want to include
2436        in the output VCF file. By default, all samples will be included. If you provide a list of
2437        samples, only those samples will be included in the output file
2438        :type list_samples: list
2439        :param index: The `index` parameter in the `export_variant_vcf` function is a boolean flag that
2440        determines whether or not to create an index for the output VCF file. If `index` is set to
2441        `True`, the output VCF file will be indexed using tabix. If `index`, defaults to False
2442        :type index: bool (optional)
2443        :param threads: The `threads` parameter in the `export_variant_vcf` function specifies the
2444        number of threads to use for exporting the VCF file. It determines how many parallel threads
2445        will be used during the export process. More threads can potentially speed up the export process
2446        by utilizing multiple cores of the processor. If
2447        :type threads: int | None
2448        :return: The `export_variant_vcf` function returns the result of calling the `export_output`
2449        method with various parameters including the output file, query, threads, sort flag, and index
2450        flag. The `export_output` method is responsible for exporting the VCF data based on the
2451        specified parameters and configurations provided in the `export_variant_vcf` function.
2452        """
2453
2454        # Config
2455        config = self.get_config()
2456
2457        # Extract VCF
2458        log.debug("Export VCF...")
2459
2460        # Table variants
2461        table_variants = self.get_table_variants()
2462
2463        # Threads
2464        if not threads:
2465            threads = self.get_threads()
2466
2467        # Info fields
2468        if remove_info:
2469            if not isinstance(remove_info, str):
2470                remove_info = "."
2471            info_field = f"""'{remove_info}' as INFO"""
2472        else:
2473            info_field = "INFO"
2474
2475        # Samples fields
2476        if add_samples:
2477            if not list_samples:
2478                list_samples = self.get_header_sample_list()
2479            if list_samples:
2480                samples_fields = " , FORMAT , " + " , ".join(list_samples)
2481            else:
2482                samples_fields = ""
2483            log.debug(f"samples_fields: {samples_fields}")
2484        else:
2485            samples_fields = ""
2486
2487        # Where clause
2488        if where_clause is None:
2489            where_clause = ""
2490
2491        # Variants
2492        select_fields = """ "#CHROM", POS, ID, REF, ALT, QUAL, FILTER """
2493        sql_query_select = f""" SELECT {select_fields}, {info_field} {samples_fields} FROM {table_variants} {where_clause} """
2494        log.debug(f"sql_query_select={sql_query_select}")
2495
2496        return self.export_output(
2497            output_file=vcf_file,
2498            output_header=None,
2499            export_header=True,
2500            query=sql_query_select,
2501            parquet_partitions=None,
2502            chunk_size=config.get("chunk_size", None),
2503            threads=threads,
2504            sort=True,
2505            index=index,
2506            order_by=None,
2507        )

The export_variant_vcf function exports a VCF file with specified samples, allowing options to remove INFO field, add samples, and control compression and indexing.

Parameters
  • vcf_file: The vcf_file parameter is the name of the file where the VCF data will be written to. It is the output file that will contain the filtered VCF data based on the specified parameters
  • remove_info: The remove_info parameter in the export_variant_vcf function is a boolean flag that determines whether to remove the INFO field from the output VCF file. If set to True, the INFO field will be removed. If set to False, the INFO field will be included in, defaults to False
  • add_samples: The add_samples parameter is a boolean parameter that determines whether the samples should be added to the VCF file or not. If set to True, the samples will be added. If set to False, the samples will be removed. The default value is True, defaults to True
  • list_samples: The list_samples parameter is a list of samples that you want to include in the output VCF file. By default, all samples will be included. If you provide a list of samples, only those samples will be included in the output file
  • index: The index parameter in the export_variant_vcf function is a boolean flag that determines whether or not to create an index for the output VCF file. If index is set to True, the output VCF file will be indexed using tabix. If index, defaults to False
  • threads: The threads parameter in the export_variant_vcf function specifies the number of threads to use for exporting the VCF file. It determines how many parallel threads will be used during the export process. More threads can potentially speed up the export process by utilizing multiple cores of the processor. If not provided, the configured number of threads is used
Returns

The export_variant_vcf function returns the result of calling the export_output method with various parameters including the output file, query, threads, sort flag, and index flag. The export_output method is responsible for exporting the VCF data based on the specified parameters and configurations provided in the export_variant_vcf function.

def run_commands(self, commands: list = [], threads: int = 1) -> None:
2509    def run_commands(self, commands: list = [], threads: int = 1) -> None:
2510        """
2511        It takes a list of commands and runs them in parallel using the number of threads specified
2512
2513        :param commands: A list of commands to run
2514        :param threads: The number of threads to use, defaults to 1 (optional)
2515        """
2516
2517        run_parallel_commands(commands, threads)

It takes a list of commands and runs them in parallel using the number of threads specified

Parameters
  • commands: A list of commands to run
  • threads: The number of threads to use, defaults to 1 (optional)
def get_threads(self, default: int = 1) -> int:
2519    def get_threads(self, default: int = 1) -> int:
2520        """
2521        This function returns the number of threads to use for a job, with a default value of 1 if not
2522        specified.
2523
2524        :param default: The `default` parameter in the `get_threads` method is used to specify the
2525        default number of threads to use if no specific value is provided. If no value is provided for
2526        the `threads` parameter in the configuration or input parameters, the `default` value will be
2527        used, defaults to 1
2528        :type default: int (optional)
2529        :return: the number of threads to use for the current job.
2530        """
2531
2532        # Config
2533        config = self.get_config()
2534
2535        # Param
2536        param = self.get_param()
2537
2538        # Input threads
2539        input_thread = param.get("threads", config.get("threads", None))
2540
2541        # Check threads
2542        if not input_thread:
2543            threads = default
2544        elif int(input_thread) <= 0:
2545            threads = os.cpu_count()
2546        else:
2547            threads = int(input_thread)
2548        return threads

This function returns the number of threads to use for a job, with a default value of 1 if not specified.

Parameters
  • default: The default parameter in the get_threads method is used to specify the default number of threads to use if no specific value is provided. If no value is provided for the threads parameter in the configuration or input parameters, the default value will be used, defaults to 1
Returns

the number of threads to use for the current job.

def get_memory(self, default: str = None) -> str:
2550    def get_memory(self, default: str = None) -> str:
2551        """
2552        This function retrieves the memory value from parameters or configuration with a default value
2553        if not found.
2554
2555        :param default: The `get_memory` function takes in a default value as a string parameter. This
2556        default value is used as a fallback in case the `memory` parameter is not provided in the
2557        `param` dictionary or the `config` dictionary. If `memory` is not found in either dictionary,
2558        the function
2559        :type default: str
2560        :return: The `get_memory` function returns a string value representing the memory parameter. If
2561        the `input_memory` is provided in the parameters, it will return that value. Otherwise, it will
2562        return the default value provided as an argument to the function.
2563        """
2564
2565        # Config
2566        config = self.get_config()
2567
2568        # Param
2569        param = self.get_param()
2570
2571        # Input threads
2572        input_memory = param.get("memory", config.get("memory", None))
2573
2574        # Check threads
2575        if input_memory:
2576            memory = input_memory
2577        else:
2578            memory = default
2579
2580        return memory

This function retrieves the memory value from parameters or configuration with a default value if not found.

Parameters
  • default: The get_memory function takes in a default value as a string parameter. This default value is used as a fallback in case the memory parameter is not provided in the param dictionary or the config dictionary. If memory is not found in either dictionary, the function returns the provided default value
Returns

The get_memory function returns a string value representing the memory parameter. If the input_memory is provided in the parameters, it will return that value. Otherwise, it will return the default value provided as an argument to the function.

def update_from_vcf(self, vcf_file: str) -> None:
2582    def update_from_vcf(self, vcf_file: str) -> None:
2583        """
2584        > If the database is duckdb, then use the parquet method, otherwise use the sqlite method
2585
2586        :param vcf_file: the path to the VCF file
2587        """
2588
2589        connexion_format = self.get_connexion_format()
2590
2591        if connexion_format in ["duckdb"]:
2592            self.update_from_vcf_duckdb(vcf_file)
2593        elif connexion_format in ["sqlite"]:
2594            self.update_from_vcf_sqlite(vcf_file)

If the database is duckdb, then use the parquet method, otherwise use the sqlite method

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_duckdb(self, vcf_file: str) -> None:
    def update_from_vcf_duckdb(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the INFO column of the
        variants table (duckdb connexion).

        The VCF data lines are loaded into a pandas DataFrame (``vcf_df``)
        which duckdb queries directly by name from the local scope. For each
        variant matching on #CHROM/POS/REF/ALT, the VCF INFO value is
        appended to the existing INFO, with ';' inserted only when both
        sides are non-empty ('' and '.' count as empty).

        :param vcf_file: the path to the VCF file
        """

        # variants table
        table_variants = self.get_table_variants()

        # Load the VCF data lines into a DataFrame; the meta-information
        # header lines are skipped so the "#CHROM" line becomes the header
        skip = self.get_header_length(file=vcf_file)
        vcf_df = pd.read_csv(
            vcf_file,
            sep="\t",
            engine="c",
            skiprows=skip,
            header=0,
            low_memory=False,
        )
        # Correlated update; duckdb resolves "vcf_df" to the local DataFrame.
        # NOTE(review): when a variant has no matching row in the VCF, the
        # subquery yields NULL; duckdb's concat() is documented to skip NULL
        # arguments, so INFO should be left unchanged — confirm this is the
        # intended behavior.
        sql_query_update = f"""
        UPDATE {table_variants} as table_variants
            SET INFO = concat(
                            CASE
                                WHEN INFO NOT IN ('', '.')
                                THEN INFO
                                ELSE ''
                            END,
                            (
                                SELECT 
                                    concat(
                                        CASE
                                            WHEN table_variants.INFO NOT IN ('','.') AND table_parquet.INFO NOT IN ('','.')
                                            THEN ';'
                                            ELSE ''
                                        END
                                        ,
                                        CASE
                                            WHEN table_parquet.INFO NOT IN ('','.')
                                            THEN table_parquet.INFO
                                            ELSE ''
                                        END
                                    )
                                FROM vcf_df as table_parquet
                                        WHERE CAST(table_parquet.\"#CHROM\" AS VARCHAR) = CAST(table_variants.\"#CHROM\" AS VARCHAR)
                                        AND table_parquet.\"POS\" = table_variants.\"POS\"
                                        AND table_parquet.\"ALT\" = table_variants.\"ALT\"
                                        AND table_parquet.\"REF\" = table_variants.\"REF\"
                                        AND table_parquet.INFO NOT IN ('','.')
                            )
                        )
            ;
            """
        self.conn.execute(sql_query_update)

It takes a VCF file and updates the INFO column of the variants table in the database with the INFO column of the VCF file

Parameters
  • vcf_file: the path to the VCF file
def update_from_vcf_sqlite(self, vcf_file: str) -> None:
    def update_from_vcf_sqlite(self, vcf_file: str) -> None:
        """
        Merge the INFO column of a VCF file into the INFO column of the
        variants table (sqlite connexion).

        A temporary table cloned from the variants schema is filled with
        the VCF records, used to update the variants table, then dropped.
        For each variant matching on #CHROM/POS/REF/ALT, the VCF INFO value
        is appended to the existing INFO, with ';' inserted only when both
        sides are non-empty ('' and '.' count as empty).

        :param vcf_file: The path to the VCF file you want to update the database with
        """

        # Create a temporary table with the same schema as 'variants'
        # (WHERE 0 copies the structure without copying any rows)
        table_vcf = "tmp_vcf"
        sql_create = (
            f"CREATE TEMPORARY TABLE {table_vcf} AS SELECT * FROM variants WHERE 0"
        )
        self.conn.execute(sql_create)

        # Load the VCF data lines into the temporary table
        # (comment="#" drops all header lines, including the "#CHROM" line)
        vcf_df = pd.read_csv(
            vcf_file, sep="\t", comment="#", header=None, low_memory=False
        )
        vcf_df.columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"]
        vcf_df.to_sql(table_vcf, self.conn, if_exists="append", index=False)

        # Update table 'variants' with VCF data
        # warning: CONCAT as || operator
        # NOTE(review): in SQLite, '' || NULL evaluates to NULL, so a variant
        # with no matching row in the VCF appears to get its INFO set to NULL
        # by this correlated subquery — confirm this is intended.
        sql_query_update = f"""
            UPDATE variants as table_variants
            SET INFO = CASE
                            WHEN INFO NOT IN ('', '.')
                            THEN INFO
                            ELSE ''
                        END ||
                        (
                        SELECT 
                            CASE 
                                WHEN table_variants.INFO NOT IN ('','.') 
                                    AND table_vcf.INFO NOT IN ('','.')  
                                THEN ';' 
                                ELSE '' 
                            END || 
                            CASE 
                                WHEN table_vcf.INFO NOT IN ('','.') 
                                THEN table_vcf.INFO 
                                ELSE '' 
                            END
                        FROM {table_vcf} as table_vcf
                        WHERE table_vcf.\"#CHROM\" = table_variants.\"#CHROM\"
                            AND table_vcf.\"POS\" = table_variants.\"POS\"
                            AND table_vcf.\"ALT\" = table_variants.\"ALT\"
                            AND table_vcf.\"REF\" = table_variants.\"REF\"
                        )
        """
        self.conn.execute(sql_query_update)

        # Drop temporary table
        sql_drop = f"DROP TABLE {table_vcf}"
        self.conn.execute(sql_drop)

It creates a temporary table in the SQLite database, loads the VCF file into the temporary table, then updates the INFO column of the variants table with the INFO column of the temporary table

Parameters
  • vcf_file: The path to the VCF file you want to update the database with
def drop_variants_table(self) -> None:
2710    def drop_variants_table(self) -> None:
2711        """
2712        > This function drops the variants table
2713        """
2714
2715        table_variants = self.get_table_variants()
2716        sql_table_variants = f"DROP TABLE IF EXISTS {table_variants}"
2717        self.conn.execute(sql_table_variants)

This function drops the variants table

def set_variant_id(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
    def set_variant_id(
        self, variant_id_column: str = "variant_id", force: bool = None
    ) -> str:
        """
        Add a variant-id column to the variants table and populate it with
        a hash derived from the assembly and the "#CHROM", POS, REF and ALT
        columns.

        :param variant_id_column: The name of the column to be created in the variants table, defaults
        to variant_id
        :type variant_id_column: str (optional)
        :param force: If True, the variant_id column will be created even if it already exists
        :type force: bool
        :return: The name of the column that contains the variant_id
        """

        # Assembly: param takes precedence over config, then the default
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Prefix used for exploded INFO fields
        prefix = self.get_explode_infos_prefix()

        # Explode INFO/SVTYPE into its own column (dropped again below)
        added_columns = self.explode_infos(prefix=prefix, fields=["SVTYPE"])

        # variants table
        table_variants = self.get_table_variants()

        # Fall back to the default column name
        if not variant_id_column:
            variant_id_column = "variant_id"

        # Create variant_id column
        # NOTE(review): the existence check uses the hard-coded name
        # "variant_id" rather than variant_id_column, so a custom column
        # name is re-created on every call — confirm whether intended.
        if "variant_id" not in self.get_extra_infos() or force:

            # Create column
            self.add_column(
                table_name=table_variants,
                column_name=variant_id_column,
                column_type="UBIGINT",
                default_value="0",
            )

            # Update column
            # NOTE(review): the last hash() argument is the string literal
            # '"<prefix>SVTYPE"' (quotes included), not the value of the
            # exploded SVTYPE column, so SVTYPE never varies the hash —
            # confirm before fixing, as a fix would change existing IDs.
            self.conn.execute(
                f"""
                    UPDATE {table_variants}
                    SET "{variant_id_column}" = hash('{assembly}', "#CHROM", "POS", "REF", "ALT", '"{prefix}SVTYPE"')
                """
            )

        # Remove the temporary exploded columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

        # return variant_id column name
        return variant_id_column

It adds a column to the variants table called variant_id and populates it with a hash of the #CHROM, POS, REF, and ALT columns

Parameters
  • variant_id_column: The name of the column to be created in the variants table, defaults to variant_id
  • force: If True, the variant_id column will be created even if it already exists
Returns

The name of the column that contains the variant_id

def get_variant_id_column(self, variant_id_column: str = 'variant_id', force: bool = None) -> str:
2778    def get_variant_id_column(
2779        self, variant_id_column: str = "variant_id", force: bool = None
2780    ) -> str:
2781        """
2782        This function returns the variant_id column name
2783
2784        :param variant_id_column: The name of the column in the dataframe that contains the variant IDs,
2785        defaults to variant_id
2786        :type variant_id_column: str (optional)
2787        :param force: If True, will force the variant_id to be set to the value of variant_id_column. If
2788        False, will only set the variant_id if it is not already set. If None, will set the variant_id
2789        if it is not already set, or if it is set
2790        :type force: bool
2791        :return: The variant_id column name.
2792        """
2793
2794        return self.set_variant_id(variant_id_column=variant_id_column, force=force)

This function returns the variant_id column name

Parameters
  • variant_id_column: The name of the column in the dataframe that contains the variant IDs, defaults to variant_id
  • force: If True, will force the variant_id to be set to the value of variant_id_column. If False, will only set the variant_id if it is not already set. If None, will set the variant_id if it is not already set, or if it is set
Returns

The variant_id column name.

def scan_databases( self, database_formats: list = ['parquet'], database_releases: list = ['current']) -> dict:
2800    def scan_databases(
2801        self,
2802        database_formats: list = ["parquet"],
2803        database_releases: list = ["current"],
2804    ) -> dict:
2805        """
2806        The function `scan_databases` scans for available databases based on specified formats and
2807        releases.
2808
2809        :param database_formats: The `database_formats` parameter is a list that specifies the formats
2810        of the databases to be scanned. In this case, the accepted format is "parquet"
2811        :type database_formats: list ["parquet"]
2812        :param database_releases: The `database_releases` parameter is a list that specifies the
2813        releases of the databases to be scanned. In the provided function, the default value for
2814        `database_releases` is set to `["current"]`, meaning that by default, the function will scan
2815        databases that are in the "current"
2816        :type database_releases: list
2817        :return: The function `scan_databases` returns a dictionary containing information about
2818        databases that match the specified formats and releases.
2819        """
2820
2821        # Config
2822        config = self.get_config()
2823
2824        # Param
2825        param = self.get_param()
2826
2827        # Param - Assembly
2828        assembly = param.get("assembly", config.get("assembly", None))
2829        if not assembly:
2830            assembly = DEFAULT_ASSEMBLY
2831            log.warning(f"Default assembly '{assembly}'")
2832
2833        # Scan for availabled databases
2834        log.info(
2835            f"Annotations - Check annotation parameters - Scan existing databases - Assembly {[assembly]} - Formats {database_formats} - Releases {database_releases}..."
2836        )
2837        databases_infos_dict = databases_infos(
2838            database_folder_releases=database_releases,
2839            database_formats=database_formats,
2840            assembly=assembly,
2841            config=config,
2842        )
2843        log.info(
2844            f"Annotations - Check annotation parameters - Scan existing databases - {len(databases_infos_dict)} databases found"
2845        )
2846
2847        return databases_infos_dict

The function scan_databases scans for available databases based on specified formats and releases.

Parameters
  • database_formats: The database_formats parameter is a list that specifies the formats of the databases to be scanned. In this case, the accepted format is "parquet"
  • database_releases: The database_releases parameter is a list that specifies the releases of the databases to be scanned. In the provided function, the default value for database_releases is set to ["current"], meaning that by default, the function will scan databases that are in the "current"
Returns

The function scan_databases returns a dictionary containing information about databases that match the specified formats and releases.

def annotation(self) -> None:
2849    def annotation(self) -> None:
2850        """
2851        It annotates the VCF file with the annotations specified in the config file.
2852        """
2853
2854        # Config
2855        config = self.get_config()
2856
2857        # Param
2858        param = self.get_param()
2859
2860        # Param - Assembly
2861        assembly = param.get("assembly", config.get("assembly", None))
2862        if not assembly:
2863            assembly = DEFAULT_ASSEMBLY
2864            log.warning(f"Default assembly '{assembly}'")
2865
2866        # annotations databases folders
2867        annotations_databases = set(
2868            config.get("folders", {})
2869            .get("databases", {})
2870            .get("annotations", [DEFAULT_ANNOTATIONS_FOLDER])
2871            + config.get("folders", {})
2872            .get("databases", {})
2873            .get("parquet", ["~/howard/databases/parquet/current"])
2874            + config.get("folders", {})
2875            .get("databases", {})
2876            .get("bcftools", ["~/howard/databases/bcftools/current"])
2877        )
2878
2879        # Get param annotations
2880        if param.get("annotations", None) and isinstance(
2881            param.get("annotations", None), str
2882        ):
2883            log.debug(param.get("annotations", None))
2884            param_annotation_list = param.get("annotations").split(",")
2885        else:
2886            param_annotation_list = []
2887
2888        # Each tools param
2889        if param.get("annotation_parquet", None) != None:
2890            log.debug(
2891                f"""param.get("annotation_parquet", None)={param.get("annotation_parquet", None)}"""
2892            )
2893            if isinstance(param.get("annotation_parquet", None), list):
2894                param_annotation_list.append(",".join(param.get("annotation_parquet")))
2895            else:
2896                param_annotation_list.append(param.get("annotation_parquet"))
2897        if param.get("annotation_snpsift", None) != None:
2898            if isinstance(param.get("annotation_snpsift", None), list):
2899                param_annotation_list.append(
2900                    "snpsift:"
2901                    + "+".join(param.get("annotation_snpsift")).replace(",", "+")
2902                )
2903            else:
2904                param_annotation_list.append(
2905                    "snpsift:" + param.get("annotation_snpsift").replace(",", "+")
2906                )
2907        if param.get("annotation_snpeff", None) != None:
2908            param_annotation_list.append("snpeff:" + param.get("annotation_snpeff"))
2909        if param.get("annotation_bcftools", None) != None:
2910            if isinstance(param.get("annotation_bcftools", None), list):
2911                param_annotation_list.append(
2912                    "bcftools:"
2913                    + "+".join(param.get("annotation_bcftools")).replace(",", "+")
2914                )
2915            else:
2916                param_annotation_list.append(
2917                    "bcftools:" + param.get("annotation_bcftools").replace(",", "+")
2918                )
2919        if param.get("annotation_annovar", None) != None:
2920            param_annotation_list.append("annovar:" + param.get("annotation_annovar"))
2921        if param.get("annotation_exomiser", None) != None:
2922            param_annotation_list.append("exomiser:" + param.get("annotation_exomiser"))
2923        if param.get("annotation_splice", None) != None:
2924            param_annotation_list.append("splice:" + param.get("annotation_splice"))
2925
2926        # Merge param annotations list
2927        param["annotations"] = ",".join(param_annotation_list)
2928
2929        # debug
2930        log.debug(f"param_annotations={param['annotations']}")
2931
2932        if param.get("annotations"):
2933
2934            # Log
2935            # log.info("Annotations - Check annotation parameters")
2936
2937            if not "annotation" in param:
2938                param["annotation"] = {}
2939
2940            # List of annotations parameters
2941            annotations_list_input = {}
2942            if isinstance(param.get("annotations", None), str):
2943                annotation_file_list = [
2944                    value for value in param.get("annotations", "").split(",")
2945                ]
2946                for annotation_file in annotation_file_list:
2947                    annotations_list_input[annotation_file] = {"INFO": None}
2948            else:
2949                annotations_list_input = param.get("annotations", {})
2950
2951            log.info(f"Quick Annotations:")
2952            for annotation_key in list(annotations_list_input.keys()):
2953                log.info(f"   {annotation_key}")
2954
2955            # List of annotations and associated fields
2956            annotations_list = {}
2957
2958            for annotation_file in annotations_list_input:
2959
2960                # Explode annotations if ALL
2961                if (
2962                    annotation_file.upper() == "ALL"
2963                    or annotation_file.upper().startswith("ALL:")
2964                ):
2965
2966                    # check ALL parameters (formats, releases)
2967                    annotation_file_split = annotation_file.split(":")
2968                    database_formats = "parquet"
2969                    database_releases = "current"
2970                    for annotation_file_option in annotation_file_split[1:]:
2971                        database_all_options_split = annotation_file_option.split("=")
2972                        if database_all_options_split[0] == "format":
2973                            database_formats = database_all_options_split[1].split("+")
2974                        if database_all_options_split[0] == "release":
2975                            database_releases = database_all_options_split[1].split("+")
2976
2977                    # Scan for availabled databases
2978                    databases_infos_dict = self.scan_databases(
2979                        database_formats=database_formats,
2980                        database_releases=database_releases,
2981                    )
2982
2983                    # Add found databases in annotation parameters
2984                    for database_infos in databases_infos_dict.keys():
2985                        annotations_list[database_infos] = {"INFO": None}
2986
2987                else:
2988                    annotations_list[annotation_file] = annotations_list_input[
2989                        annotation_file
2990                    ]
2991
2992            # Check each databases
2993            if len(annotations_list):
2994
2995                log.info(
2996                    f"Annotations - Check annotation parameters - Check {len(annotations_list)} databases..."
2997                )
2998
2999                for annotation_file in annotations_list:
3000
3001                    # Init
3002                    annotations = annotations_list.get(annotation_file, None)
3003
3004                    # Annotation snpEff
3005                    if annotation_file.startswith("snpeff"):
3006
3007                        log.debug(f"Quick Annotation snpEff")
3008
3009                        if "snpeff" not in param["annotation"]:
3010                            param["annotation"]["snpeff"] = {}
3011
3012                        if "options" not in param["annotation"]["snpeff"]:
3013                            param["annotation"]["snpeff"]["options"] = ""
3014
3015                        # snpEff options in annotations
3016                        param["annotation"]["snpeff"]["options"] = "".join(
3017                            annotation_file.split(":")[1:]
3018                        )
3019
3020                    # Annotation Annovar
3021                    elif annotation_file.startswith("annovar"):
3022
3023                        log.debug(f"Quick Annotation Annovar")
3024
3025                        if "annovar" not in param["annotation"]:
3026                            param["annotation"]["annovar"] = {}
3027
3028                        if "annotations" not in param["annotation"]["annovar"]:
3029                            param["annotation"]["annovar"]["annotations"] = {}
3030
3031                        # Options
3032                        annotation_file_split = annotation_file.split(":")
3033                        for annotation_file_annotation in annotation_file_split[1:]:
3034                            if annotation_file_annotation:
3035                                param["annotation"]["annovar"]["annotations"][
3036                                    annotation_file_annotation
3037                                ] = annotations
3038
3039                    # Annotation Exomiser
3040                    elif annotation_file.startswith("exomiser"):
3041
3042                        log.debug(f"Quick Annotation Exomiser")
3043
3044                        param["annotation"]["exomiser"] = params_string_to_dict(
3045                            annotation_file
3046                        )
3047
3048                    # Annotation Splice
3049                    elif annotation_file.startswith("splice"):
3050
3051                        log.debug(f"Quick Annotation Splice")
3052
3053                        param["annotation"]["splice"] = params_string_to_dict(
3054                            annotation_file
3055                        )
3056
3057                    # Annotation Parquet or BCFTOOLS
3058                    else:
3059
3060                        # Tools detection
3061                        if annotation_file.startswith("bcftools:"):
3062                            annotation_tool_initial = "bcftools"
3063                            annotation_file = ":".join(annotation_file.split(":")[1:])
3064                        elif annotation_file.startswith("snpsift:"):
3065                            annotation_tool_initial = "snpsift"
3066                            annotation_file = ":".join(annotation_file.split(":")[1:])
3067                        else:
3068                            annotation_tool_initial = None
3069
3070                        # list of files
3071                        annotation_file_list = annotation_file.replace("+", ":").split(
3072                            ":"
3073                        )
3074
3075                        for annotation_file in annotation_file_list:
3076
3077                            if annotation_file:
3078
3079                                # Annotation tool initial
3080                                annotation_tool = annotation_tool_initial
3081
3082                                # Find file
3083                                annotation_file_found = None
3084
3085                                # Expand user
3086                                annotation_file = full_path(annotation_file)
3087
3088                                if os.path.exists(annotation_file):
3089                                    annotation_file_found = annotation_file
3090
3091                                else:
3092                                    # Find within assembly folders
3093                                    for annotations_database in annotations_databases:
3094                                        found_files = find_all(
3095                                            annotation_file,
3096                                            os.path.join(
3097                                                annotations_database, assembly
3098                                            ),
3099                                        )
3100                                        if len(found_files) > 0:
3101                                            annotation_file_found = found_files[0]
3102                                            break
3103                                    if not annotation_file_found and not assembly:
3104                                        # Find within folders
3105                                        for (
3106                                            annotations_database
3107                                        ) in annotations_databases:
3108                                            found_files = find_all(
3109                                                annotation_file, annotations_database
3110                                            )
3111                                            if len(found_files) > 0:
3112                                                annotation_file_found = found_files[0]
3113                                                break
3114                                log.debug(
3115                                    f"for {annotation_file} annotation_file_found={annotation_file_found}"
3116                                )
3117
3118                                # Full path
3119                                annotation_file_found = full_path(annotation_file_found)
3120
3121                                if annotation_file_found:
3122
3123                                    database = Database(database=annotation_file_found)
3124                                    quick_annotation_format = database.get_format()
3125                                    quick_annotation_is_compressed = (
3126                                        database.is_compressed()
3127                                    )
3128                                    quick_annotation_is_indexed = os.path.exists(
3129                                        f"{annotation_file_found}.tbi"
3130                                    )
3131                                    bcftools_preference = False
3132
3133                                    # Check Annotation Tool
3134                                    if not annotation_tool:
3135                                        if (
3136                                            bcftools_preference
3137                                            and quick_annotation_format
3138                                            in ["vcf", "bed"]
3139                                            and quick_annotation_is_compressed
3140                                            and quick_annotation_is_indexed
3141                                        ):
3142                                            annotation_tool = "bcftools"
3143                                        elif quick_annotation_format in [
3144                                            "vcf",
3145                                            "bed",
3146                                            "tsv",
3147                                            "tsv",
3148                                            "csv",
3149                                            "json",
3150                                            "tbl",
3151                                            "parquet",
3152                                            "duckdb",
3153                                        ]:
3154                                            annotation_tool = "parquet"
3155                                        else:
3156                                            log.error(
3157                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3158                                            )
3159                                            raise ValueError(
3160                                                f"Quick Annotation File {annotation_file_found} - Format {quick_annotation_format} not supported yet"
3161                                            )
3162
3163                                    log.debug(
3164                                        f"Quick Annotation File {annotation_file} - Annotation tool: {annotation_tool}"
3165                                    )
3166
3167                                    # Annotation Tool dispatch
3168                                    if annotation_tool:
3169                                        if annotation_tool not in param["annotation"]:
3170                                            param["annotation"][annotation_tool] = {}
3171                                        if (
3172                                            "annotations"
3173                                            not in param["annotation"][annotation_tool]
3174                                        ):
3175                                            param["annotation"][annotation_tool][
3176                                                "annotations"
3177                                            ] = {}
3178                                        param["annotation"][annotation_tool][
3179                                            "annotations"
3180                                        ][annotation_file_found] = annotations
3181
3182                                else:
3183                                    log.error(
3184                                        f"Quick Annotation File {annotation_file} does NOT exist"
3185                                    )
3186
3187                self.set_param(param)
3188
3189        if param.get("annotation", None):
3190            log.info("Annotations")
3191            if param.get("annotation", {}).get("parquet", None):
3192                log.info("Annotations 'parquet'...")
3193                self.annotation_parquet()
3194            if param.get("annotation", {}).get("bcftools", None):
3195                log.info("Annotations 'bcftools'...")
3196                self.annotation_bcftools()
3197            if param.get("annotation", {}).get("snpsift", None):
3198                log.info("Annotations 'snpsift'...")
3199                self.annotation_snpsift()
3200            if param.get("annotation", {}).get("annovar", None):
3201                log.info("Annotations 'annovar'...")
3202                self.annotation_annovar()
3203            if param.get("annotation", {}).get("snpeff", None):
3204                log.info("Annotations 'snpeff'...")
3205                self.annotation_snpeff()
3206            if param.get("annotation", {}).get("exomiser", None) is not None:
3207                log.info("Annotations 'exomiser'...")
3208                self.annotation_exomiser()
3209            if param.get("annotation", {}).get("splice", None) is not None:
3210                log.info("Annotations 'splice' ...")
3211                self.annotation_splice()
3212
3213        # Explode INFOS fields into table fields
3214        if self.get_explode_infos():
3215            self.explode_infos(
3216                prefix=self.get_explode_infos_prefix(),
3217                fields=self.get_explode_infos_fields(),
3218                force=True,
3219            )

It annotates the VCF file with the annotations specified in the config file.

def annotation_snpsift(self, threads: int = None) -> None:
3221    def annotation_snpsift(self, threads: int = None) -> None:
3222        """
3223        This function annotate with bcftools
3224
3225        :param threads: Number of threads to use
3226        :return: the value of the variable "return_value".
3227        """
3228
3229        # DEBUG
3230        log.debug("Start annotation with bcftools databases")
3231
3232        # Threads
3233        if not threads:
3234            threads = self.get_threads()
3235        log.debug("Threads: " + str(threads))
3236
3237        # Config
3238        config = self.get_config()
3239        log.debug("Config: " + str(config))
3240
3241        # Config - snpSift
3242        snpsift_bin_command = get_bin_command(
3243            bin="SnpSift.jar",
3244            tool="snpsift",
3245            bin_type="jar",
3246            config=config,
3247            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
3248        )
3249        if not snpsift_bin_command:
3250            msg_err = f"Annotation failed: no snpsift bin '{snpsift_bin_command}'"
3251            log.error(msg_err)
3252            raise ValueError(msg_err)
3253
3254        # Config - bcftools
3255        bcftools_bin_command = get_bin_command(
3256            bin="bcftools",
3257            tool="bcftools",
3258            bin_type="bin",
3259            config=config,
3260            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
3261        )
3262        if not bcftools_bin_command:
3263            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
3264            log.error(msg_err)
3265            raise ValueError(msg_err)
3266
3267        # Config - BCFTools databases folders
3268        databases_folders = set(
3269            self.get_config()
3270            .get("folders", {})
3271            .get("databases", {})
3272            .get("annotations", ["."])
3273            + self.get_config()
3274            .get("folders", {})
3275            .get("databases", {})
3276            .get("bcftools", ["."])
3277        )
3278        log.debug("Databases annotations: " + str(databases_folders))
3279
3280        # Param
3281        annotations = (
3282            self.get_param()
3283            .get("annotation", {})
3284            .get("snpsift", {})
3285            .get("annotations", None)
3286        )
3287        log.debug("Annotations: " + str(annotations))
3288
3289        # Assembly
3290        assembly = self.get_param().get(
3291            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
3292        )
3293
3294        # Data
3295        table_variants = self.get_table_variants()
3296
3297        # Check if not empty
3298        log.debug("Check if not empty")
3299        sql_query_chromosomes = (
3300            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
3301        )
3302        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
3303        if not sql_query_chromosomes_df["count"][0]:
3304            log.info(f"VCF empty")
3305            return
3306
3307        # VCF header
3308        vcf_reader = self.get_header()
3309        log.debug("Initial header: " + str(vcf_reader.infos))
3310
3311        # Existing annotations
3312        for vcf_annotation in self.get_header().infos:
3313
3314            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
3315            log.debug(
3316                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
3317            )
3318
3319        if annotations:
3320
3321            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
3322
3323                # Export VCF file
3324                tmp_vcf_name = os.path.join(tmp_dir, "input.vcf.gz")
3325
3326                # Init
3327                commands = {}
3328
3329                for annotation in annotations:
3330                    annotation_fields = annotations[annotation]
3331
3332                    # Annotation Name
3333                    annotation_name = os.path.basename(annotation)
3334
3335                    if not annotation_fields:
3336                        annotation_fields = {"INFO": None}
3337
3338                    log.debug(f"Annotation '{annotation_name}'")
3339                    log.debug(
3340                        f"Annotation '{annotation_name}' - fields: {annotation_fields}"
3341                    )
3342
3343                    # Create Database
3344                    database = Database(
3345                        database=annotation,
3346                        databases_folders=databases_folders,
3347                        assembly=assembly,
3348                    )
3349
3350                    # Find files
3351                    db_file = database.get_database()
3352                    db_file = full_path(db_file)
3353                    db_hdr_file = database.get_header_file()
3354                    db_hdr_file = full_path(db_hdr_file)
3355                    db_file_type = database.get_format()
3356                    db_tbi_file = f"{db_file}.tbi"
3357                    db_file_compressed = database.is_compressed()
3358
3359                    # Check if compressed
3360                    if not db_file_compressed:
3361                        log.error(
3362                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3363                        )
3364                        raise ValueError(
3365                            f"Annotation '{annotation}' - {db_file} NOT compressed file"
3366                        )
3367
3368                    # Check if indexed
3369                    if not os.path.exists(db_tbi_file):
3370                        log.error(
3371                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3372                        )
3373                        raise ValueError(
3374                            f"Annotation '{annotation}' - {db_file} NOT indexed file"
3375                        )
3376
3377                    # Check index - try to create if not exists
3378                    if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
3379                        log.error("Annotation failed: database not valid")
3380                        log.error(f"Annotation annotation file: {db_file}")
3381                        log.error(f"Annotation annotation header: {db_hdr_file}")
3382                        log.error(f"Annotation annotation index: {db_tbi_file}")
3383                        raise ValueError(
3384                            f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
3385                        )
3386                    else:
3387
3388                        log.debug(
3389                            f"Annotation '{annotation}' - file: "
3390                            + str(db_file)
3391                            + " and "
3392                            + str(db_hdr_file)
3393                        )
3394
3395                        # Load header as VCF object
3396                        db_hdr_vcf = Variants(input=db_hdr_file)
3397                        db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
3398                        log.debug(
3399                            "Annotation database header: "
3400                            + str(db_hdr_vcf_header_infos)
3401                        )
3402
3403                        # For all fields in database
3404                        annotation_fields_full = False
3405                        if "ALL" in annotation_fields or "INFO" in annotation_fields:
3406                            annotation_fields = {
3407                                key: key for key in db_hdr_vcf_header_infos
3408                            }
3409                            log.debug(
3410                                "Annotation database header - All annotations added: "
3411                                + str(annotation_fields)
3412                            )
3413                            annotation_fields_full = True
3414
3415                        # # Create file for field rename
3416                        # log.debug("Create file for field rename")
3417                        # tmp_rename = NamedTemporaryFile(
3418                        #     prefix=self.get_prefix(),
3419                        #     dir=self.get_tmp_dir(),
3420                        #     suffix=".rename",
3421                        #     delete=False,
3422                        # )
3423                        # tmp_rename_name = tmp_rename.name
3424                        # tmp_files.append(tmp_rename_name)
3425
3426                        # Number of fields
3427                        nb_annotation_field = 0
3428                        annotation_list = []
3429                        annotation_infos_rename_list = []
3430
3431                        for annotation_field in annotation_fields:
3432
3433                            # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
3434                            annotation_fields_new_name = annotation_fields.get(
3435                                annotation_field, annotation_field
3436                            )
3437                            if not annotation_fields_new_name:
3438                                annotation_fields_new_name = annotation_field
3439
3440                            # Check if field is in DB and if field is not elready in input data
3441                            if (
3442                                annotation_field in db_hdr_vcf.get_header().infos
3443                                and annotation_fields_new_name
3444                                not in self.get_header().infos
3445                            ):
3446
3447                                log.info(
3448                                    f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
3449                                )
3450
3451                                # BCFTools annotate param to rename fields
3452                                if annotation_field != annotation_fields_new_name:
3453                                    annotation_infos_rename_list.append(
3454                                        f"{annotation_fields_new_name}:=INFO/{annotation_field}"
3455                                    )
3456
3457                                # Add INFO field to header
3458                                db_hdr_vcf_header_infos_number = (
3459                                    db_hdr_vcf_header_infos[annotation_field].num or "."
3460                                )
3461                                db_hdr_vcf_header_infos_type = (
3462                                    db_hdr_vcf_header_infos[annotation_field].type
3463                                    or "String"
3464                                )
3465                                db_hdr_vcf_header_infos_description = (
3466                                    db_hdr_vcf_header_infos[annotation_field].desc
3467                                    or f"{annotation_field} description"
3468                                )
3469                                db_hdr_vcf_header_infos_source = (
3470                                    db_hdr_vcf_header_infos[annotation_field].source
3471                                    or "unknown"
3472                                )
3473                                db_hdr_vcf_header_infos_version = (
3474                                    db_hdr_vcf_header_infos[annotation_field].version
3475                                    or "unknown"
3476                                )
3477
3478                                vcf_reader.infos[annotation_fields_new_name] = (
3479                                    vcf.parser._Info(
3480                                        annotation_fields_new_name,
3481                                        db_hdr_vcf_header_infos_number,
3482                                        db_hdr_vcf_header_infos_type,
3483                                        db_hdr_vcf_header_infos_description,
3484                                        db_hdr_vcf_header_infos_source,
3485                                        db_hdr_vcf_header_infos_version,
3486                                        self.code_type_map[
3487                                            db_hdr_vcf_header_infos_type
3488                                        ],
3489                                    )
3490                                )
3491
3492                                annotation_list.append(annotation_field)
3493
3494                                nb_annotation_field += 1
3495
3496                            else:
3497
3498                                if (
3499                                    annotation_field
3500                                    not in db_hdr_vcf.get_header().infos
3501                                ):
3502                                    log.warning(
3503                                        f"Annotation '{annotation_name}' - '{annotation_field}' - not available in vcf/bed file"
3504                                    )
3505                                if (
3506                                    annotation_fields_new_name
3507                                    in self.get_header().infos
3508                                ):
3509                                    log.warning(
3510                                        f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' - already exists (skipped)"
3511                                    )
3512
3513                        log.info(
3514                            f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
3515                        )
3516
3517                        annotation_infos = ",".join(annotation_list)
3518
3519                        if annotation_infos != "":
3520
3521                            # Annotated VCF (and error file)
3522                            tmp_annotation_vcf_name = os.path.join(
3523                                tmp_dir, os.path.basename(annotation) + ".vcf.gz"
3524                            )
3525                            tmp_annotation_vcf_name_err = (
3526                                tmp_annotation_vcf_name + ".err"
3527                            )
3528
3529                            # Add fields to annotate
3530                            if not annotation_fields_full:
3531                                annotation_infos_option = f"-info {annotation_infos}"
3532                            else:
3533                                annotation_infos_option = ""
3534
3535                            # Info fields rename
3536                            if annotation_infos_rename_list:
3537                                annotation_infos_rename = " -c " + ",".join(
3538                                    annotation_infos_rename_list
3539                                )
3540                            else:
3541                                annotation_infos_rename = ""
3542
3543                            # Annotate command
3544                            command_annotate = f"{snpsift_bin_command} annotate {annotation_infos_option} {db_file} {tmp_vcf_name} | {bcftools_bin_command} annotate --threads={threads} {annotation_infos_rename} -Oz1 -o {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "
3545
3546                            # Add command
3547                            commands[command_annotate] = tmp_annotation_vcf_name
3548
3549                if commands:
3550
3551                    # Export VCF file
3552                    self.export_variant_vcf(
3553                        vcf_file=tmp_vcf_name,
3554                        remove_info=True,
3555                        add_samples=False,
3556                        index=True,
3557                    )
3558                    shutil.copyfile(tmp_vcf_name, "/tmp/input.vcf")
3559
3560                    # Num command
3561                    nb_command = 0
3562
3563                    # Annotate
3564                    for command_annotate in commands:
3565                        nb_command += 1
3566                        log.info(
3567                            f"Annotation - Annotate [{nb_command}/{len(commands)}]..."
3568                        )
3569                        log.debug(f"command_annotate={command_annotate}")
3570                        run_parallel_commands([command_annotate], threads)
3571
3572                        # Debug
3573                        shutil.copyfile(commands[command_annotate], "/tmp/snpsift.vcf")
3574
3575                        # Update variants
3576                        log.info(
3577                            f"Annotation - Updating [{nb_command}/{len(commands)}]..."
3578                        )
3579                        self.update_from_vcf(commands[command_annotate])

This function annotates with bcftools

Parameters
  • threads: Number of threads to use
Returns

None; the variants table is updated in place.

def annotation_bcftools(self, threads: int = None) -> None:
    def annotation_bcftools(self, threads: int = None) -> None:
        """
        Annotate the variants table with bcftools, using the annotation
        databases (compressed/indexed VCF or BED files) configured in the
        param section "annotation" -> "bcftools" -> "annotations".

        Workflow:
        1. Resolve the bcftools binary and the database folders from config.
        2. For each configured database, resolve its files (data, header,
           tabix index), validate them, and register the requested INFO
           fields (or all fields when "ALL"/"INFO" is given) into the
           in-memory VCF header.
        3. Build one `bcftools annotate` command per chromosome, restricted
           to the variant regions (padded ±1 Mb and merged) written to a
           temporary BED file.
        4. Export the current variants to a temporary VCF, run all annotate
           commands in parallel, merge the per-chromosome outputs back with
           `bcftools merge`, scan the collected stderr files, and update the
           variants table from the merged VCF.

        :param threads: Number of threads to use; defaults to self.get_threads()
        :return: None; the variants table and in-memory header are updated in place
        :raises ValueError: if the bcftools binary is missing, a database file is
            not compressed, not indexed, or missing, or if any annotation command
            wrote an "[E::" message to stderr
        """

        # DEBUG
        log.debug("Start annotation with bcftools databases")

        # Threads: fall back to the instance-level setting when not provided
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # DEBUG
        # NOTE(review): delete_tmp is computed here but never read later in this
        # method — temp files are removed via the merge command instead. Confirm intent.
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config - BCFTools bin command (resolved from config or default tools folder)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools databases folders
        # Union of the generic "annotations" folders and the bcftools-specific ones
        databases_folders = set(
            self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("annotations", ["."])
            + self.get_config()
            .get("folders", {})
            .get("databases", {})
            .get("bcftools", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Param: mapping of database -> fields to annotate (None means all INFO fields)
        annotations = (
            self.get_param()
            .get("annotation", {})
            .get("bcftools", {})
            .get("annotations", None)
        )
        log.debug("Annotations: " + str(annotations))

        # Assembly (param overrides config, which falls back to the default)
        assembly = self.get_param().get(
            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
        )

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty: nothing to annotate on an empty variants table
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # Export in VCF
        # Temporary VCF that will hold the exported variants to annotate
        # (delete=False: the file path is passed to external shell commands)
        log.debug("Create initial file to annotate")
        tmp_vcf = NamedTemporaryFile(
            prefix=self.get_prefix(),
            dir=self.get_tmp_dir(),
            suffix=".vcf.gz",
            delete=False,
        )
        tmp_vcf_name = tmp_vcf.name

        # VCF header (mutated below when new INFO fields are added)
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations (debug listing only)
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        if annotations:

            # Accumulators shared across all databases:
            # per-chromosome annotated VCFs, shell commands, temp files, stderr files
            tmp_ann_vcf_list = []
            commands = []
            tmp_files = []
            err_files = []

            for annotation in annotations:
                annotation_fields = annotations[annotation]

                # Annotation Name
                annotation_name = os.path.basename(annotation)

                # No explicit fields -> annotate with all INFO fields of the database
                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.debug(f"Annotation '{annotation_name}'")
                log.debug(
                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
                )

                # Create Database (resolves the database across configured folders)
                database = Database(
                    database=annotation,
                    databases_folders=databases_folders,
                    assembly=assembly,
                )

                # Find files: data file, header file, format, tabix index
                db_file = database.get_database()
                db_file = full_path(db_file)
                db_hdr_file = database.get_header_file()
                db_hdr_file = full_path(db_hdr_file)
                db_file_type = database.get_format()
                db_tbi_file = f"{db_file}.tbi"
                db_file_compressed = database.is_compressed()

                # Check if compressed (bcftools annotate requires bgzip-compressed input)
                if not db_file_compressed:
                    log.error(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT compressed file"
                    )

                # Check if indexed (tabix .tbi required for region-based annotation)
                if not os.path.exists(db_tbi_file):
                    log.error(f"Annotation '{annotation}' - {db_file} NOT indexed file")
                    raise ValueError(
                        f"Annotation '{annotation}' - {db_file} NOT indexed file"
                    )

                # Check that the database data file and header file both exist
                if not os.path.exists(db_file) or not os.path.exists(db_hdr_file):
                    log.error("Annotation failed: database not valid")
                    log.error(f"Annotation annotation file: {db_file}")
                    log.error(f"Annotation annotation header: {db_hdr_file}")
                    log.error(f"Annotation annotation index: {db_tbi_file}")
                    raise ValueError(
                        f"Annotation failed: database not valid - annotation file {db_file} / annotation header {db_hdr_file} / annotation index {db_tbi_file} / annotation compression {db_file_compressed}"
                    )
                else:

                    log.debug(
                        f"Annotation '{annotation}' - file: "
                        + str(db_file)
                        + " and "
                        + str(db_hdr_file)
                    )

                    # Load header as VCF object (header-only Variants instance)
                    db_hdr_vcf = Variants(input=db_hdr_file)
                    db_hdr_vcf_header_infos = db_hdr_vcf.get_header().infos
                    log.debug(
                        "Annotation database header: " + str(db_hdr_vcf_header_infos)
                    )

                    # For all fields in database: "ALL"/"INFO" expands to every
                    # INFO field declared in the database header (identity mapping)
                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
                        annotation_fields = {
                            key: key for key in db_hdr_vcf_header_infos
                        }
                        log.debug(
                            "Annotation database header - All annotations added: "
                            + str(annotation_fields)
                        )

                    # Number of fields
                    nb_annotation_field = 0
                    annotation_list = []

                    for annotation_field in annotation_fields:

                        # Field output name: the mapping value renames the field;
                        # a falsy value keeps the original name
                        annotation_fields_new_name = annotation_fields.get(
                            annotation_field, annotation_field
                        )
                        if not annotation_fields_new_name:
                            annotation_fields_new_name = annotation_field

                        # Annotate only fields present in the database header and
                        # not already present in the input data header
                        if (
                            annotation_field in db_hdr_vcf.get_header().infos
                            and annotation_fields_new_name
                            not in self.get_header().infos
                        ):

                            log.info(
                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'"
                            )

                            # Add INFO field to header, defaulting missing metadata
                            db_hdr_vcf_header_infos_number = (
                                db_hdr_vcf_header_infos[annotation_field].num or "."
                            )
                            db_hdr_vcf_header_infos_type = (
                                db_hdr_vcf_header_infos[annotation_field].type
                                or "String"
                            )
                            db_hdr_vcf_header_infos_description = (
                                db_hdr_vcf_header_infos[annotation_field].desc
                                or f"{annotation_field} description"
                            )
                            db_hdr_vcf_header_infos_source = (
                                db_hdr_vcf_header_infos[annotation_field].source
                                or "unknown"
                            )
                            db_hdr_vcf_header_infos_version = (
                                db_hdr_vcf_header_infos[annotation_field].version
                                or "unknown"
                            )

                            # Register the (possibly renamed) field in the in-memory header
                            vcf_reader.infos[annotation_fields_new_name] = (
                                vcf.parser._Info(
                                    annotation_fields_new_name,
                                    db_hdr_vcf_header_infos_number,
                                    db_hdr_vcf_header_infos_type,
                                    db_hdr_vcf_header_infos_description,
                                    db_hdr_vcf_header_infos_source,
                                    db_hdr_vcf_header_infos_version,
                                    self.code_type_map[db_hdr_vcf_header_infos_type],
                                )
                            )

                            # bcftools -c syntax: "NEW:=INFO/OLD" renames on the fly
                            if annotation_field != annotation_fields_new_name:
                                annotation_list.append(
                                    f"{annotation_fields_new_name}:=INFO/{annotation_field}"
                                )
                            else:
                                annotation_list.append(annotation_field)

                            nb_annotation_field += 1

                        else:

                            # Explain why the field is skipped
                            if annotation_field not in db_hdr_vcf.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_field}' - not available in vcf/bed file"
                                )
                            if annotation_fields_new_name in self.get_header().infos:
                                log.warning(
                                    f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                                )

                    log.info(
                        f"Annotation '{annotation_name}' - {nb_annotation_field} annotations available in vcf/bed file"
                    )

                    # Comma-separated -c column list for bcftools annotate
                    annotation_infos = ",".join(annotation_list)

                    if annotation_infos != "":

                        # Protect header for bcftools (remove "#CHROM" and variants line)
                        log.debug("Protect Header file - remove #CHROM line if exists")
                        tmp_header_vcf = NamedTemporaryFile(
                            prefix=self.get_prefix(),
                            dir=self.get_tmp_dir(),
                            suffix=".hdr",
                            delete=False,
                        )
                        tmp_header_vcf_name = tmp_header_vcf.name
                        tmp_files.append(tmp_header_vcf_name)
                        # Command: keep only "##" meta lines from the database header
                        if db_hdr_file.endswith(".gz"):
                            command_extract_header = f"zcat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        else:
                            command_extract_header = f"cat {db_hdr_file} | grep '^##' > {tmp_header_vcf_name}"
                        # Run
                        run_parallel_commands([command_extract_header], 1)

                        # Find chromosomes present in the variants table
                        log.debug("Find chromosomes ")
                        sql_query_chromosomes = f"""SELECT table_variants.\"#CHROM\" as CHROM FROM {table_variants} as table_variants GROUP BY table_variants.\"#CHROM\""""
                        sql_query_chromosomes_df = self.get_query_to_df(
                            sql_query_chromosomes
                        )
                        chomosomes_list = list(sql_query_chromosomes_df["CHROM"])

                        log.debug("Chromosomes found: " + str(list(chomosomes_list)))

                        # BED databases have no INFO column layout: prepend the
                        # positional CHROM,POS,POS columns expected by bcftools -c
                        if db_file_type in ["bed"]:
                            annotation_infos = "CHROM,POS,POS," + annotation_infos

                        # One annotate command per chromosome
                        for chrom in chomosomes_list:

                            # Create BED on initial VCF
                            log.debug("Create BED on initial VCF: " + str(tmp_vcf_name))
                            tmp_bed = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".bed",
                                delete=False,
                            )
                            tmp_bed_name = tmp_bed.name
                            tmp_files.append(tmp_bed_name)

                            # Detect regions: pad each variant position by ±1 Mb
                            # (clamped at 0) and merge overlapping intervals
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Start detecting regions..."
                            )
                            window = 1000000
                            sql_query_intervals_for_bed = f"""
                                SELECT  \"#CHROM\",
                                        CASE WHEN \"POS\"-{window}-1 < 0 THEN 0 ELSE \"POS\"-{window}-1 END,
                                        \"POS\"+{window}
                                FROM {table_variants} as table_variants
                                WHERE table_variants.\"#CHROM\" = '{chrom}'
                            """
                            regions = self.conn.execute(
                                sql_query_intervals_for_bed
                            ).fetchall()
                            merged_regions = merge_regions(regions)
                            log.debug(
                                f"Annotation '{annotation}' - Chromosome '{chrom}' - Stop detecting regions..."
                            )

                            header = ["#CHROM", "START", "END"]
                            with open(tmp_bed_name, "w") as f:
                                # Write the header with tab delimiter
                                f.write("\t".join(header) + "\n")
                                for d in merged_regions:
                                    # Write each data row with tab delimiter
                                    f.write("\t".join(map(str, d)) + "\n")

                            # Tmp files: per-chromosome annotated output (+ stderr capture)
                            tmp_annotation_vcf = NamedTemporaryFile(
                                prefix=self.get_prefix(),
                                dir=self.get_tmp_dir(),
                                suffix=".vcf.gz",
                                delete=False,
                            )
                            tmp_annotation_vcf_name = tmp_annotation_vcf.name
                            tmp_files.append(tmp_annotation_vcf_name)
                            tmp_ann_vcf_list.append(f"{tmp_annotation_vcf_name}")
                            tmp_annotation_vcf_name_err = (
                                tmp_annotation_vcf_name + ".err"
                            )
                            err_files.append(tmp_annotation_vcf_name_err)

                            # Annotate Command
                            log.debug(
                                f"Annotation '{annotation}' - add bcftools command"
                            )

                            # Command: annotate restricted to the region BED, then tabix-index
                            # the output. -Oz1: compressed output, compression level 1.
                            # NOTE(review): tabix is invoked bare (not resolved via
                            # get_bin_command) — assumes it is on PATH; confirm.
                            command_annotate = f"{bcftools_bin_command} annotate --pair-logic exact --regions-file={tmp_bed_name} -a {db_file} -h {tmp_header_vcf_name} -c {annotation_infos} {tmp_vcf_name} -o {tmp_annotation_vcf_name} -Oz1 2>>{tmp_annotation_vcf_name_err} && tabix {tmp_annotation_vcf_name} 2>>{tmp_annotation_vcf_name_err} "

                            # Add command
                            commands.append(command_annotate)

            # if some commands
            if commands:

                # Export VCF file (variants without INFO/samples, indexed) to annotate
                self.export_variant_vcf(
                    vcf_file=tmp_vcf_name,
                    remove_info=True,
                    add_samples=False,
                    index=True,
                )

                # Threads
                # Split the available threads evenly across the annotate commands
                if commands:
                    threads_bcftools_annotate = round(threads / len(commands))
                else:
                    threads_bcftools_annotate = 1

                # At least one thread per command
                if not threads_bcftools_annotate:
                    threads_bcftools_annotate = 1

                # Add threads option to bcftools commands
                if threads_bcftools_annotate > 1:
                    commands_threaded = []
                    for command in commands:
                        commands_threaded.append(
                            command.replace(
                                f"{bcftools_bin_command} annotate ",
                                f"{bcftools_bin_command} annotate --threads={threads_bcftools_annotate} ",
                            )
                        )
                    commands = commands_threaded

                # Command annotation multithreading
                log.debug(f"Annotation - Annotation commands: " + str(commands))
                log.info(
                    f"Annotation - Annotation multithreaded in "
                    + str(len(commands))
                    + " commands"
                )

                run_parallel_commands(commands, threads)

                # Merge: join all per-chromosome annotated VCFs into one file
                tmp_ann_vcf_list_cmd = " ".join(tmp_ann_vcf_list)

                if tmp_ann_vcf_list_cmd:

                    # Tmp file for the merged result
                    tmp_annotate_vcf = NamedTemporaryFile(
                        prefix=self.get_prefix(),
                        dir=self.get_tmp_dir(),
                        suffix=".vcf.gz",
                        delete=True,
                    )
                    tmp_annotate_vcf_name = tmp_annotate_vcf.name
                    tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                    err_files.append(tmp_annotate_vcf_name_err)

                    # Tmp file remove command: cleanup chained after a successful merge
                    tmp_files_remove_command = ""
                    if tmp_files:
                        tmp_files_remove_command = " && rm -f " + " ".join(tmp_files)

                    # Command merge
                    merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_ann_vcf_list_cmd} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} {tmp_files_remove_command}"
                    log.info(
                        f"Annotation - Annotation merging "
                        + str(len(commands))
                        + " annotated files"
                    )
                    log.debug(f"Annotation - merge command: {merge_command}")
                    run_parallel_commands([merge_command], 1)

                    # Error messages: collect stderr from every command;
                    # "[W::" lines are warnings, "[E::" lines are fatal
                    log.info(f"Error/Warning messages:")
                    error_message_command_all = []
                    error_message_command_warning = []
                    error_message_command_err = []
                    for err_file in err_files:
                        with open(err_file, "r") as f:
                            for line in f:
                                message = line.strip()
                                error_message_command_all.append(message)
                                if line.startswith("[W::"):
                                    error_message_command_warning.append(message)
                                if line.startswith("[E::"):
                                    error_message_command_err.append(
                                        f"{err_file}: " + message
                                    )
                    # log info
                    for message in list(
                        set(error_message_command_err + error_message_command_warning)
                    ):
                        log.info(f"   {message}")
                    # debug info
                    for message in list(set(error_message_command_all)):
                        log.debug(f"   {message}")
                    # failed: any "[E::" stderr line aborts the annotation
                    if len(error_message_command_err):
                        log.error("Annotation failed: Error in commands")
                        raise ValueError("Annotation failed: Error in commands")

                    # Update variants table from the merged annotated VCF
                    log.info(f"Annotation - Updating...")
                    self.update_from_vcf(tmp_annotate_vcf_name)

This function annotates with Exomiser

Parameters
  • threads: Number of threads to use
Returns

None; the variants table is updated in place.

def annotation_exomiser(self, threads: int = None) -> None:
4062    def annotation_exomiser(self, threads: int = None) -> None:
4063        """
4064        This function annotate with Exomiser
4065
4066        This function uses args as parameters, in section "annotation" -> "exomiser", with sections:
4067        - "analysis" (dict/file):
4068            Full analysis dictionnary parameters (see Exomiser docs).
4069            Either a dict, or a file in JSON or YAML format.
4070            These parameters may change depending on other parameters (e.g. phenotipicFeatures/HPO)
4071            Default : None
4072        - "preset" (string):
4073            Analysis preset (available in config folder).
4074            Used if no full "analysis" is provided.
4075            Default: "exome"
4076        - "phenopacket" (dict/file):
4077            Samples and phenotipic features parameters (see Exomiser docs).
4078            Either a dict, or a file in JSON or YAML format.
4079            Default: None
4080        - "subject" (dict):
4081            Sample parameters (see Exomiser docs).
4082            Example:
4083                "subject":
4084                    {
4085                        "id": "ISDBM322017",
4086                        "sex": "FEMALE"
4087                    }
4088            Default: None
4089        - "sample" (string):
4090            Sample name to construct "subject" section:
4091                "subject":
4092                    {
4093                        "id": "<sample>",
4094                        "sex": "UNKNOWN_SEX"
4095                    }
4096            Default: None
4097        - "phenotypicFeatures" (dict)
4098            Phenotypic features to construct "subject" section.
4099            Example:
4100                "phenotypicFeatures":
4101                    [
4102                        { "type": { "id": "HP:0001159", "label": "Syndactyly" } },
4103                        { "type": { "id": "HP:0000486", "label": "Strabismus" } }
4104                    ]
4105        - "hpo" (list)
4106            List of HPO ids as phenotypic features.
4107            Example:
4108                "hpo": ['0001156', '0001363', '0011304', '0010055']
4109            Default: []
4110        - "outputOptions" (dict):
4111            Output options (see Exomiser docs).
4112            Default:
4113                "output_options" =
4114                    {
4115                        "outputContributingVariantsOnly": False,
4116                        "numGenes": 0,
4117                        "outputFormats": ["TSV_VARIANT", "VCF"]
4118                    }
4119        - "transcript_source" (string):
4120            Transcript source (either "refseq", "ucsc", "ensembl")
4121            Default: "refseq"
4122        - "exomiser_to_info" (boolean):
4123            Add exomiser TSV file columns as INFO fields in VCF.
4124            Default: False
4125        - "release" (string):
4126            Exomise database release.
4127            If not exists, database release will be downloaded (take a while).
4128            Default: None (provided by application.properties configuration file)
4129        - "exomiser_application_properties" (file):
4130            Exomiser configuration file (see Exomiser docs).
4131            Useful to automatically download databases (especially for specific genome databases).
4132
4133        Notes:
4134        - If no sample in parameters, first sample in VCF will be chosen
4135        - If no HPO found, "hiPhivePrioritiser" analysis step will be switch off
4136
4137        :param threads: The number of threads to use
4138        :return: None.
4139        """
4140
4141        # DEBUG
4142        log.debug("Start annotation with Exomiser databases")
4143
4144        # Threads
4145        if not threads:
4146            threads = self.get_threads()
4147        log.debug("Threads: " + str(threads))
4148
4149        # Config
4150        config = self.get_config()
4151        log.debug("Config: " + str(config))
4152
4153        # Config - Folders - Databases
4154        databases_folders = (
4155            config.get("folders", {})
4156            .get("databases", {})
4157            .get("exomiser", f"{DEFAULT_DATABASE_FOLDER}/exomiser/current")
4158        )
4159        databases_folders = full_path(databases_folders)
4160        if not os.path.exists(databases_folders):
4161            log.error(f"Databases annotations: {databases_folders} NOT found")
4162        log.debug("Databases annotations: " + str(databases_folders))
4163
4164        # Config - Exomiser
4165        exomiser_bin_command = get_bin_command(
4166            bin="exomiser-cli*.jar",
4167            tool="exomiser",
4168            bin_type="jar",
4169            config=config,
4170            default_folder=f"{DEFAULT_TOOLS_FOLDER}/exomiser",
4171        )
4172        log.debug("Exomiser bin command: " + str(exomiser_bin_command))
4173        if not exomiser_bin_command:
4174            msg_err = f"Annotation failed: no exomiser bin '{exomiser_bin_command}'"
4175            log.error(msg_err)
4176            raise ValueError(msg_err)
4177
4178        # Param
4179        param = self.get_param()
4180        log.debug("Param: " + str(param))
4181
4182        # Param - Exomiser
4183        param_exomiser = param.get("annotation", {}).get("exomiser", {})
4184        log.debug(f"Param Exomiser: {param_exomiser}")
4185
4186        # Param - Assembly
4187        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4188        log.debug("Assembly: " + str(assembly))
4189
4190        # Data
4191        table_variants = self.get_table_variants()
4192
4193        # Check if not empty
4194        log.debug("Check if not empty")
4195        sql_query_chromosomes = (
4196            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4197        )
4198        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4199            log.info(f"VCF empty")
4200            return False
4201
4202        # VCF header
4203        vcf_reader = self.get_header()
4204        log.debug("Initial header: " + str(vcf_reader.infos))
4205
4206        # Samples
4207        samples = self.get_header_sample_list()
4208        if not samples:
4209            log.error("No Samples in VCF")
4210            return False
4211        log.debug(f"Samples: {samples}")
4212
4213        # Memory limit
4214        memory_limit = self.get_memory("8G")
4215        log.debug(f"memory_limit: {memory_limit}")
4216
4217        # Exomiser java options
4218        exomiser_java_options = (
4219            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4220        )
4221        log.debug(f"Exomiser java options: {exomiser_java_options}")
4222
4223        # Download Exomiser (if not exists)
4224        exomiser_release = param_exomiser.get("release", None)
4225        exomiser_application_properties = param_exomiser.get(
4226            "exomiser_application_properties", None
4227        )
4228        databases_download_exomiser(
4229            assemblies=[assembly],
4230            exomiser_folder=databases_folders,
4231            exomiser_release=exomiser_release,
4232            exomiser_phenotype_release=exomiser_release,
4233            exomiser_application_properties=exomiser_application_properties,
4234        )
4235
4236        # Force annotation
4237        force_update_annotation = True
4238
4239        if "Exomiser" not in self.get_header().infos or force_update_annotation:
4240            log.debug("Start annotation Exomiser")
4241
4242            with TemporaryDirectory(dir=self.get_tmp_dir()) as tmp_dir:
4243
4244                # tmp_dir = "/tmp/exomiser"
4245
4246                ### ANALYSIS ###
4247                ################
4248
4249                # Create analysis.json through analysis dict
4250                # either analysis in param or by default
4251                # depending on preset exome/genome)
4252
4253                # Init analysis dict
4254                param_exomiser_analysis_dict = {}
4255
4256                # analysis from param
4257                param_exomiser_analysis = param_exomiser.get("analysis", {})
4258                param_exomiser_analysis = full_path(param_exomiser_analysis)
4259
4260                # If analysis in param -> load anlaysis json
4261                if param_exomiser_analysis:
4262
4263                    # If param analysis is a file and exists
4264                    if isinstance(param_exomiser_analysis, str) and os.path.exists(
4265                        param_exomiser_analysis
4266                    ):
4267                        # Load analysis file into analysis dict (either yaml or json)
4268                        with open(param_exomiser_analysis) as json_file:
4269                            param_exomiser_analysis_dict = yaml.safe_load(json_file)
4270
4271                    # If param analysis is a dict
4272                    elif isinstance(param_exomiser_analysis, dict):
4273                        # Load analysis dict into analysis dict (either yaml or json)
4274                        param_exomiser_analysis_dict = param_exomiser_analysis
4275
4276                    # Error analysis type
4277                    else:
4278                        log.error(f"Analysis type unknown. Check param file.")
4279                        raise ValueError(f"Analysis type unknown. Check param file.")
4280
4281                # Case no input analysis config file/dict
4282                # Use preset (exome/genome) to open default config file
4283                if not param_exomiser_analysis_dict:
4284
4285                    # default preset
4286                    default_preset = "exome"
4287
4288                    # Get param preset or default preset
4289                    param_exomiser_preset = param_exomiser.get("preset", default_preset)
4290
4291                    # Try to find if preset is a file
4292                    if os.path.exists(param_exomiser_preset):
4293                        # Preset file is provided in full path
4294                        param_exomiser_analysis_default_config_file = (
4295                            param_exomiser_preset
4296                        )
4297                    # elif os.path.exists(full_path(param_exomiser_preset)):
4298                    #     # Preset file is provided in full path
4299                    #     param_exomiser_analysis_default_config_file = full_path(param_exomiser_preset)
4300                    elif os.path.exists(
4301                        os.path.join(folder_config, param_exomiser_preset)
4302                    ):
4303                        # Preset file is provided a basename in config folder (can be a path with subfolders)
4304                        param_exomiser_analysis_default_config_file = os.path.join(
4305                            folder_config, param_exomiser_preset
4306                        )
4307                    else:
4308                        # Construct preset file
4309                        param_exomiser_analysis_default_config_file = os.path.join(
4310                            folder_config,
4311                            f"preset-{param_exomiser_preset}-analysis.json",
4312                        )
4313
4314                    # If preset file exists
4315                    param_exomiser_analysis_default_config_file = full_path(
4316                        param_exomiser_analysis_default_config_file
4317                    )
4318                    if os.path.exists(param_exomiser_analysis_default_config_file):
4319                        # Load prest file into analysis dict (either yaml or json)
4320                        with open(
4321                            param_exomiser_analysis_default_config_file
4322                        ) as json_file:
4323                            # param_exomiser_analysis_dict[""] = json.load(json_file)
4324                            param_exomiser_analysis_dict["analysis"] = yaml.safe_load(
4325                                json_file
4326                            )
4327
4328                    # Error preset file
4329                    else:
4330                        log.error(
4331                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4332                        )
4333                        raise ValueError(
4334                            f"No analysis preset config file ({param_exomiser_analysis_default_config_file})"
4335                        )
4336
4337                # If no analysis dict created
4338                if not param_exomiser_analysis_dict:
4339                    log.error(f"No analysis config")
4340                    raise ValueError(f"No analysis config")
4341
4342                # Log
4343                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4344
4345                ### PHENOPACKET ###
4346                ###################
4347
4348                # If no PhenoPacket in analysis dict -> check in param
4349                if "phenopacket" not in param_exomiser_analysis_dict:
4350
4351                    # If PhenoPacket in param -> load anlaysis json
4352                    if param_exomiser.get("phenopacket", None):
4353
4354                        param_exomiser_phenopacket = param_exomiser.get("phenopacket")
4355                        param_exomiser_phenopacket = full_path(
4356                            param_exomiser_phenopacket
4357                        )
4358
4359                        # If param phenopacket is a file and exists
4360                        if isinstance(
4361                            param_exomiser_phenopacket, str
4362                        ) and os.path.exists(param_exomiser_phenopacket):
4363                            # Load phenopacket file into analysis dict (either yaml or json)
4364                            with open(param_exomiser_phenopacket) as json_file:
4365                                param_exomiser_analysis_dict["phenopacket"] = (
4366                                    yaml.safe_load(json_file)
4367                                )
4368
4369                        # If param phenopacket is a dict
4370                        elif isinstance(param_exomiser_phenopacket, dict):
4371                            # Load phenopacket dict into analysis dict (either yaml or json)
4372                            param_exomiser_analysis_dict["phenopacket"] = (
4373                                param_exomiser_phenopacket
4374                            )
4375
4376                        # Error phenopacket type
4377                        else:
4378                            log.error(f"Phenopacket type unknown. Check param file.")
4379                            raise ValueError(
4380                                f"Phenopacket type unknown. Check param file."
4381                            )
4382
4383                # If no PhenoPacket in analysis dict -> construct from sample and HPO in param
4384                if "phenopacket" not in param_exomiser_analysis_dict:
4385
4386                    # Init PhenoPacket
4387                    param_exomiser_analysis_dict["phenopacket"] = {
4388                        "id": "analysis",
4389                        "proband": {},
4390                    }
4391
4392                    ### Add subject ###
4393
4394                    # If subject exists
4395                    param_exomiser_subject = param_exomiser.get("subject", {})
4396
4397                    # If subject not exists -> found sample ID
4398                    if not param_exomiser_subject:
4399
4400                        # Found sample ID in param
4401                        sample = param_exomiser.get("sample", None)
4402
4403                        # Find sample ID (first sample)
4404                        if not sample:
4405                            sample_list = self.get_header_sample_list()
4406                            if len(sample_list) > 0:
4407                                sample = sample_list[0]
4408                            else:
4409                                log.error(f"No sample found")
4410                                raise ValueError(f"No sample found")
4411
4412                        # Create subject
4413                        param_exomiser_subject = {"id": sample, "sex": "UNKNOWN_SEX"}
4414
4415                    # Add to dict
4416                    param_exomiser_analysis_dict["phenopacket"][
4417                        "subject"
4418                    ] = param_exomiser_subject
4419
4420                    ### Add "phenotypicFeatures" ###
4421
4422                    # If phenotypicFeatures exists
4423                    param_exomiser_phenotypicfeatures = param_exomiser.get(
4424                        "phenotypicFeatures", []
4425                    )
4426
4427                    # If phenotypicFeatures not exists -> Try to infer from hpo list
4428                    if not param_exomiser_phenotypicfeatures:
4429
4430                        # Found HPO in param
4431                        param_exomiser_hpo = param_exomiser.get("hpo", [])
4432
4433                        # Split HPO if list in string format separated by comma
4434                        if isinstance(param_exomiser_hpo, str):
4435                            param_exomiser_hpo = param_exomiser_hpo.split(",")
4436
4437                        # Create HPO list
4438                        for hpo in param_exomiser_hpo:
4439                            hpo_clean = re.sub("[^0-9]", "", hpo)
4440                            param_exomiser_phenotypicfeatures.append(
4441                                {
4442                                    "type": {
4443                                        "id": f"HP:{hpo_clean}",
4444                                        "label": f"HP:{hpo_clean}",
4445                                    }
4446                                }
4447                            )
4448
4449                    # Add to dict
4450                    param_exomiser_analysis_dict["phenopacket"][
4451                        "phenotypicFeatures"
4452                    ] = param_exomiser_phenotypicfeatures
4453
4454                    # If phenotypicFeatures not exists -> Remove hiPhivePrioritiser step
4455                    if not param_exomiser_phenotypicfeatures:
4456                        for step in param_exomiser_analysis_dict.get(
4457                            "analysis", {}
4458                        ).get("steps", []):
4459                            if "hiPhivePrioritiser" in step:
4460                                param_exomiser_analysis_dict.get("analysis", {}).get(
4461                                    "steps", []
4462                                ).remove(step)
4463
4464                ### Add Input File ###
4465
4466                # Initial file name and htsFiles
4467                tmp_vcf_name = os.path.join(tmp_dir, "initial.vcf.gz")
4468                param_exomiser_analysis_dict["phenopacket"]["htsFiles"] = [
4469                    {
4470                        "uri": tmp_vcf_name,
4471                        "htsFormat": "VCF",
4472                        "genomeAssembly": assembly,
4473                    }
4474                ]
4475
4476                ### Add metaData ###
4477
4478                # If metaData not in analysis dict
4479                if "metaData" not in param_exomiser_analysis_dict:
4480                    param_exomiser_analysis_dict["phenopacket"]["metaData"] = {
4481                        "created": f"{datetime.datetime.now()}".replace(" ", "T") + "Z",
4482                        "createdBy": "howard",
4483                        "phenopacketSchemaVersion": 1,
4484                    }
4485
4486                ### OutputOptions ###
4487
4488                # Init output result folder
4489                output_results = os.path.join(tmp_dir, "results")
4490
4491                # If no outputOptions in analysis dict
4492                if "outputOptions" not in param_exomiser_analysis_dict:
4493
4494                    # default output formats
4495                    defaut_output_formats = ["TSV_VARIANT", "VCF"]
4496
4497                    # Get outputOptions in param
4498                    output_options = param_exomiser.get("outputOptions", None)
4499
4500                    # If no output_options in param -> check
4501                    if not output_options:
4502                        output_options = {
4503                            "outputContributingVariantsOnly": False,
4504                            "numGenes": 0,
4505                            "outputFormats": defaut_output_formats,
4506                        }
4507
4508                    # Replace outputDirectory in output options
4509                    output_options["outputDirectory"] = output_results
4510                    output_options["outputFileName"] = "howard"
4511
4512                    # Add outputOptions in analysis dict
4513                    param_exomiser_analysis_dict["outputOptions"] = output_options
4514
4515                else:
4516
4517                    # Replace output_results and output format (if exists in param)
4518                    param_exomiser_analysis_dict["outputOptions"][
4519                        "outputDirectory"
4520                    ] = output_results
4521                    param_exomiser_analysis_dict["outputOptions"]["outputFormats"] = (
4522                        list(
4523                            set(
4524                                param_exomiser_analysis_dict.get(
4525                                    "outputOptions", {}
4526                                ).get("outputFormats", [])
4527                                + ["TSV_VARIANT", "VCF"]
4528                            )
4529                        )
4530                    )
4531
4532                # log
4533                log.debug(f"Pre analysis dict: {param_exomiser_analysis_dict}")
4534
4535                ### ANALYSIS FILE ###
4536                #####################
4537
4538                ### Full JSON analysis config file ###
4539
4540                exomiser_analysis = os.path.join(tmp_dir, "analysis.json")
4541                with open(exomiser_analysis, "w") as fp:
4542                    json.dump(param_exomiser_analysis_dict, fp, indent=4)
4543
4544                ### SPLIT analysis and sample config files
4545
4546                # Splitted analysis dict
4547                param_exomiser_analysis_dict_for_split = (
4548                    param_exomiser_analysis_dict.copy()
4549                )
4550
4551                # Phenopacket JSON file
4552                exomiser_analysis_phenopacket = os.path.join(
4553                    tmp_dir, "analysis_phenopacket.json"
4554                )
4555                with open(exomiser_analysis_phenopacket, "w") as fp:
4556                    json.dump(
4557                        param_exomiser_analysis_dict_for_split.get("phenopacket"),
4558                        fp,
4559                        indent=4,
4560                    )
4561
4562                # Analysis JSON file without Phenopacket parameters
4563                param_exomiser_analysis_dict_for_split.pop("phenopacket")
4564                exomiser_analysis_analysis = os.path.join(
4565                    tmp_dir, "analysis_analysis.json"
4566                )
4567                with open(exomiser_analysis_analysis, "w") as fp:
4568                    json.dump(param_exomiser_analysis_dict_for_split, fp, indent=4)
4569
4570                ### INITAL VCF file ###
4571                #######################
4572
4573                ### Create list of samples to use and include inti initial VCF file ####
4574
4575                # Subject (main sample)
4576                # Get sample ID in analysis dict
4577                sample_subject = (
4578                    param_exomiser_analysis_dict.get("phenopacket", {})
4579                    .get("subject", {})
4580                    .get("id", None)
4581                )
4582                sample_proband = (
4583                    param_exomiser_analysis_dict.get("phenopacket", {})
4584                    .get("proband", {})
4585                    .get("subject", {})
4586                    .get("id", None)
4587                )
4588                sample = []
4589                if sample_subject:
4590                    sample.append(sample_subject)
4591                if sample_proband:
4592                    sample.append(sample_proband)
4593
4594                # Get sample ID within Pedigree
4595                pedigree_persons_list = (
4596                    param_exomiser_analysis_dict.get("phenopacket", {})
4597                    .get("pedigree", {})
4598                    .get("persons", {})
4599                )
4600
4601                # Create list with all sample ID in pedigree (if exists)
4602                pedigree_persons = []
4603                for person in pedigree_persons_list:
4604                    pedigree_persons.append(person.get("individualId"))
4605
4606                # Concat subject sample ID and samples ID in pedigreesamples
4607                samples = list(set(sample + pedigree_persons))
4608
4609                # Check if sample list is not empty
4610                if not samples:
4611                    log.error(f"No samples found")
4612                    raise ValueError(f"No samples found")
4613
4614                # Create VCF with sample (either sample in param or first one by default)
4615                # Export VCF file
4616                self.export_variant_vcf(
4617                    vcf_file=tmp_vcf_name,
4618                    remove_info=True,
4619                    add_samples=True,
4620                    list_samples=samples,
4621                    index=False,
4622                )
4623
4624                ### Execute Exomiser ###
4625                ########################
4626
4627                # Init command
4628                exomiser_command = ""
4629
4630                # Command exomiser options
4631                exomiser_options = f" --spring.config.location={databases_folders}/{assembly}/application.properties --exomiser.data-directory={databases_folders}/{assembly} "
4632
4633                # Release
4634                exomiser_release = param_exomiser.get("release", None)
4635                if exomiser_release:
4636                    # phenotype data version
4637                    exomiser_options += (
4638                        f" --exomiser.phenotype.data-version={exomiser_release} "
4639                    )
4640                    # data version
4641                    exomiser_options += (
4642                        f" --exomiser.{assembly}.data-version={exomiser_release} "
4643                    )
4644                    # variant white list
4645                    variant_white_list_file = (
4646                        f"{exomiser_release}_{assembly}_clinvar_whitelist.tsv.gz"
4647                    )
4648                    if os.path.exists(
4649                        os.path.join(
4650                            databases_folders, assembly, variant_white_list_file
4651                        )
4652                    ):
4653                        exomiser_options += f" --exomiser.{assembly}.variant-white-list-path={variant_white_list_file} "
4654
4655                # transcript_source
4656                transcript_source = param_exomiser.get(
4657                    "transcript_source", None
4658                )  # ucsc, refseq, ensembl
4659                if transcript_source:
4660                    exomiser_options += (
4661                        f" --exomiser.{assembly}.transcript-source={transcript_source} "
4662                    )
4663
4664                # If analysis contain proband param
4665                if param_exomiser_analysis_dict.get("phenopacket", {}).get(
4666                    "proband", {}
4667                ):
4668                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis_analysis} --sample={exomiser_analysis_phenopacket} {exomiser_options} "
4669
4670                # If no proband (usually uniq sample)
4671                else:
4672                    exomiser_command_analysis = f" {exomiser_bin_command} --analysis={exomiser_analysis} {exomiser_options}"
4673
4674                # Log
4675                log.debug(f"exomiser_command_analysis={exomiser_command_analysis}")
4676
4677                # Run command
4678                result = subprocess.call(
4679                    exomiser_command_analysis.split(), stdout=subprocess.PIPE
4680                )
4681                if result:
4682                    log.error("Exomiser command failed")
4683                    raise ValueError("Exomiser command failed")
4684
4685                ### RESULTS ###
4686                ###############
4687
4688                ### Annotate with TSV fields ###
4689
4690                # Init result tsv file
4691                exomiser_to_info = param_exomiser.get("exomiser_to_info", False)
4692
4693                # Init result tsv file
4694                output_results_tsv = os.path.join(output_results, "howard.variants.tsv")
4695
4696                # Parse TSV file and explode columns in INFO field
4697                if exomiser_to_info and os.path.exists(output_results_tsv):
4698
4699                    # Log
4700                    log.debug("Exomiser columns to VCF INFO field")
4701
4702                    # Retrieve columns and types
4703                    query = f""" SELECT * FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) LIMIT 0 """
4704                    output_results_tsv_df = self.get_query_to_df(query)
4705                    output_results_tsv_columns = output_results_tsv_df.columns.tolist()
4706
4707                    # Init concat fields for update
4708                    sql_query_update_concat_fields = []
4709
4710                    # Fields to avoid
4711                    fields_to_avoid = [
4712                        "CONTIG",
4713                        "START",
4714                        "END",
4715                        "REF",
4716                        "ALT",
4717                        "QUAL",
4718                        "FILTER",
4719                        "GENOTYPE",
4720                    ]
4721
4722                    # List all columns to add into header
4723                    for header_column in output_results_tsv_columns:
4724
4725                        # If header column is enable
4726                        if header_column not in fields_to_avoid:
4727
4728                            # Header info type
4729                            header_info_type = "String"
4730                            header_column_df = output_results_tsv_df[header_column]
4731                            header_column_df_dtype = header_column_df.dtype
4732                            if header_column_df_dtype == object:
4733                                if (
4734                                    pd.to_numeric(header_column_df, errors="coerce")
4735                                    .notnull()
4736                                    .all()
4737                                ):
4738                                    header_info_type = "Float"
4739                            else:
4740                                header_info_type = "Integer"
4741
4742                            # Header info
4743                            characters_to_validate = ["-"]
4744                            pattern = "[" + "".join(characters_to_validate) + "]"
4745                            header_info_name = re.sub(
4746                                pattern,
4747                                "_",
4748                                f"Exomiser_{header_column}".replace("#", ""),
4749                            )
4750                            header_info_number = "."
4751                            header_info_description = (
4752                                f"Exomiser {header_column} annotation"
4753                            )
4754                            header_info_source = "Exomiser"
4755                            header_info_version = "unknown"
4756                            header_info_code = CODE_TYPE_MAP[header_info_type]
4757                            vcf_reader.infos[header_info_name] = vcf.parser._Info(
4758                                header_info_name,
4759                                header_info_number,
4760                                header_info_type,
4761                                header_info_description,
4762                                header_info_source,
4763                                header_info_version,
4764                                header_info_code,
4765                            )
4766
4767                            # Add field to add for update to concat fields
4768                            sql_query_update_concat_fields.append(
4769                                f"""
4770                                CASE
4771                                    WHEN table_parquet."{header_column}" NOT IN ('','.')
4772                                    THEN concat(
4773                                        '{header_info_name}=',
4774                                        table_parquet."{header_column}",
4775                                        ';'
4776                                        )
4777
4778                                    ELSE ''
4779                                END
4780                            """
4781                            )
4782
4783                    # Update query
4784                    sql_query_update = f"""
4785                        UPDATE {table_variants} as table_variants
4786                            SET INFO = concat(
4787                                            CASE
4788                                                WHEN INFO NOT IN ('', '.')
4789                                                THEN INFO
4790                                                ELSE ''
4791                                            END,
4792                                            CASE
4793                                                WHEN table_variants.INFO NOT IN ('','.')
4794                                                THEN ';'
4795                                                ELSE ''
4796                                            END,
4797                                            (
4798                                            SELECT 
4799                                                concat(
4800                                                    {",".join(sql_query_update_concat_fields)}
4801                                                )
4802                                            FROM read_csv('{output_results_tsv}', auto_detect=True, delim='\t', sample_size=-1) as table_parquet
4803                                                    WHERE concat('chr', CAST(table_parquet.\"CONTIG\" AS STRING)) = table_variants.\"#CHROM\"
4804                                                    AND table_parquet.\"START\" = table_variants.\"POS\"
4805                                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
4806                                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
4807                                            )
4808                                        )
4809                            ;
4810                        """
4811
4812                    # Update
4813                    self.conn.execute(sql_query_update)
4814
4815                ### Annotate with VCF INFO field ###
4816
4817                # Init result VCF file
4818                output_results_vcf = os.path.join(output_results, "howard.vcf.gz")
4819
4820                # If VCF exists
4821                if os.path.exists(output_results_vcf):
4822
4823                    # Log
4824                    log.debug("Exomiser result VCF update variants")
4825
4826                    # Find Exomiser INFO field annotation in header
4827                    with gzip.open(output_results_vcf, "rt") as f:
4828                        header_list = self.read_vcf_header(f)
4829                    exomiser_vcf_header = vcf.Reader(
4830                        io.StringIO("\n".join(header_list))
4831                    )
4832
4833                    # Add annotation INFO field to header
4834                    vcf_reader.infos["Exomiser"] = exomiser_vcf_header.infos["Exomiser"]
4835
4836                    # Update variants with VCF
4837                    self.update_from_vcf(output_results_vcf)
4838
4839        return True

This function annotates with Exomiser.

This function uses args as parameters, in section "annotation" -> "exomiser", with sections:

  • "analysis" (dict/file): Full analysis dictionary parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. These parameters may change depending on other parameters (e.g. phenotypicFeatures/HPO) Default : None
  • "preset" (string): Analysis preset (available in config folder). Used if no full "analysis" is provided. Default: "exome"
  • "phenopacket" (dict/file): Samples and phenotypic features parameters (see Exomiser docs). Either a dict, or a file in JSON or YAML format. Default: None
  • "subject" (dict): Sample parameters (see Exomiser docs). Example: "subject": { "id": "ISDBM322017", "sex": "FEMALE" } Default: None
  • "sample" (string): Sample name to construct "subject" section: "subject": { "id": "", "sex": "UNKNOWN_SEX" } Default: None
  • "phenotypicFeatures" (dict) Phenotypic features to construct "subject" section. Example: "phenotypicFeatures": [ { "type": { "id": "HP:0001159", "label": "Syndactyly" } }, { "type": { "id": "HP:0000486", "label": "Strabismus" } } ]
  • "hpo" (list) List of HPO ids as phenotypic features. Example: "hpo": ['0001156', '0001363', '0011304', '0010055'] Default: []
  • "outputOptions" (dict): Output options (see Exomiser docs). Default: "output_options" = { "outputContributingVariantsOnly": False, "numGenes": 0, "outputFormats": ["TSV_VARIANT", "VCF"] }
  • "transcript_source" (string): Transcript source (either "refseq", "ucsc", "ensembl") Default: "refseq"
  • "exomiser_to_info" (boolean): Add exomiser TSV file columns as INFO fields in VCF. Default: False
  • "release" (string): Exomiser database release. If it does not exist, the database release will be downloaded (takes a while). Default: None (provided by application.properties configuration file)
  • "exomiser_application_properties" (file): Exomiser configuration file (see Exomiser docs). Useful to automatically download databases (especially for specific genome databases).

Notes:

  • If no sample in parameters, first sample in VCF will be chosen
  • If no HPO found, the "hiPhivePrioritiser" analysis step will be switched off
Parameters
  • threads: The number of threads to use
Returns

None.

def annotation_snpeff(self, threads: int = None) -> None:
4841    def annotation_snpeff(self, threads: int = None) -> None:
4842        """
4843        This function annotate with snpEff
4844
4845        :param threads: The number of threads to use
4846        :return: the value of the variable "return_value".
4847        """
4848
4849        # DEBUG
4850        log.debug("Start annotation with snpeff databases")
4851
4852        # Threads
4853        if not threads:
4854            threads = self.get_threads()
4855        log.debug("Threads: " + str(threads))
4856
4857        # DEBUG
4858        delete_tmp = True
4859        if self.get_config().get("verbosity", "warning") in ["debug"]:
4860            delete_tmp = False
4861            log.debug("Delete tmp files/folders: " + str(delete_tmp))
4862
4863        # Config
4864        config = self.get_config()
4865        log.debug("Config: " + str(config))
4866
4867        # Config - Folders - Databases
4868        databases_folders = (
4869            config.get("folders", {}).get("databases", {}).get("snpeff", ["."])
4870        )
4871        log.debug("Databases annotations: " + str(databases_folders))
4872
4873        # # Config - Java
4874        # java_bin = get_bin(
4875        #     tool="java",
4876        #     bin="java",
4877        #     bin_type="bin",
4878        #     config=config,
4879        #     default_folder="/usr/bin",
4880        # )
4881        # if not (os.path.exists(java_bin) or (java_bin and which(java_bin))):
4882        #     log.error(f"Annotation failed: no java bin '{java_bin}'")
4883        #     raise ValueError(f"Annotation failed: no java bin '{java_bin}'")
4884
4885        # # Config - snpEff bin
4886        # snpeff_jar = get_bin(
4887        #     tool="snpeff",
4888        #     bin="snpEff.jar",
4889        #     bin_type="jar",
4890        #     config=config,
4891        #     default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4892        # )
4893        # if not (os.path.exists(snpeff_jar) or (snpeff_jar and which(snpeff_jar))):
4894        #     log.error(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4895        #     raise ValueError(f"Annotation failed: no snpEff jar '{snpeff_jar}'")
4896
4897        # Config - snpEff bin command
4898        snpeff_bin_command = get_bin_command(
4899            bin="snpEff.jar",
4900            tool="snpeff",
4901            bin_type="jar",
4902            config=config,
4903            default_folder=f"{DEFAULT_TOOLS_FOLDER}/snpeff",
4904        )
4905        if not snpeff_bin_command:
4906            msg_err = f"Annotation failed: no snpeff bin '{snpeff_bin_command}'"
4907            log.error(msg_err)
4908            raise ValueError(msg_err)
4909
4910        # Config - snpEff databases
4911        snpeff_databases = (
4912            config.get("folders", {})
4913            .get("databases", {})
4914            .get("snpeff", DEFAULT_SNPEFF_FOLDER)
4915        )
4916        snpeff_databases = full_path(snpeff_databases)
4917        if snpeff_databases is not None and snpeff_databases != "":
4918            log.debug(f"Create snpEff databases folder")
4919            if not os.path.exists(snpeff_databases):
4920                os.makedirs(snpeff_databases)
4921
4922        # Param
4923        param = self.get_param()
4924        log.debug("Param: " + str(param))
4925
4926        # Param
4927        options = param.get("annotation", {}).get("snpeff", {}).get("options", None)
4928        log.debug("Options: " + str(options))
4929
4930        # Param - Assembly
4931        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))
4932
4933        # Param - Options
4934        snpeff_options = (
4935            param.get("annotation", {}).get("snpeff", {}).get("options", "")
4936        )
4937        snpeff_stats = param.get("annotation", {}).get("snpeff", {}).get("stats", None)
4938        snpeff_csvstats = (
4939            param.get("annotation", {}).get("snpeff", {}).get("csvStats", None)
4940        )
4941        if snpeff_stats:
4942            snpeff_stats = snpeff_stats.replace("OUTPUT", self.get_output())
4943            snpeff_stats = full_path(snpeff_stats)
4944            snpeff_options += f" -stats {snpeff_stats}"
4945        if snpeff_csvstats:
4946            snpeff_csvstats = snpeff_csvstats.replace("OUTPUT", self.get_output())
4947            snpeff_csvstats = full_path(snpeff_csvstats)
4948            snpeff_options += f" -csvStats {snpeff_csvstats}"
4949
4950        # Data
4951        table_variants = self.get_table_variants()
4952
4953        # Check if not empty
4954        log.debug("Check if not empty")
4955        sql_query_chromosomes = (
4956            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
4957        )
4958        # if not self.conn.execute(f"{sql_query_chromosomes}").df()["count"][0]:
4959        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
4960            log.info(f"VCF empty")
4961            return
4962
4963        # Export in VCF
4964        log.debug("Create initial file to annotate")
4965        tmp_vcf = NamedTemporaryFile(
4966            prefix=self.get_prefix(),
4967            dir=self.get_tmp_dir(),
4968            suffix=".vcf.gz",
4969            delete=True,
4970        )
4971        tmp_vcf_name = tmp_vcf.name
4972
4973        # VCF header
4974        vcf_reader = self.get_header()
4975        log.debug("Initial header: " + str(vcf_reader.infos))
4976
4977        # Existing annotations
4978        for vcf_annotation in self.get_header().infos:
4979
4980            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
4981            log.debug(
4982                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
4983            )
4984
4985        # Memory limit
4986        # if config.get("memory", None):
4987        #     memory_limit = config.get("memory", "8G")
4988        # else:
4989        #     memory_limit = "8G"
4990        memory_limit = self.get_memory("8G")
4991        log.debug(f"memory_limit: {memory_limit}")
4992
4993        # snpEff java options
4994        snpeff_java_options = (
4995            f" -Xmx{memory_limit} -XX:+UseParallelGC -XX:ParallelGCThreads={threads} "
4996        )
4997        log.debug(f"Exomiser java options: {snpeff_java_options}")
4998
4999        force_update_annotation = True
5000
5001        if "ANN" not in self.get_header().infos or force_update_annotation:
5002
5003            # Check snpEff database
5004            log.debug(f"Check snpEff databases {[assembly]}")
5005            databases_download_snpeff(
5006                folder=snpeff_databases, assemblies=[assembly], config=config
5007            )
5008
5009            # Export VCF file
5010            self.export_variant_vcf(
5011                vcf_file=tmp_vcf_name,
5012                remove_info=True,
5013                add_samples=False,
5014                index=True,
5015            )
5016
5017            # Tmp file
5018            err_files = []
5019            tmp_annotate_vcf = NamedTemporaryFile(
5020                prefix=self.get_prefix(),
5021                dir=self.get_tmp_dir(),
5022                suffix=".vcf",
5023                delete=False,
5024            )
5025            tmp_annotate_vcf_name = tmp_annotate_vcf.name
5026            tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
5027            err_files.append(tmp_annotate_vcf_name_err)
5028
5029            # Command
5030            snpeff_command = f"{snpeff_bin_command} {assembly} -dataDir {snpeff_databases} {snpeff_options} {tmp_vcf_name} 1>{tmp_annotate_vcf_name} 2>>{tmp_annotate_vcf_name_err}"
5031            log.debug(f"Annotation - snpEff command: {snpeff_command}")
5032            run_parallel_commands([snpeff_command], 1)
5033
5034            # Error messages
5035            log.info(f"Error/Warning messages:")
5036            error_message_command_all = []
5037            error_message_command_warning = []
5038            error_message_command_err = []
5039            for err_file in err_files:
5040                with open(err_file, "r") as f:
5041                    for line in f:
5042                        message = line.strip()
5043                        error_message_command_all.append(message)
5044                        if line.startswith("[W::"):
5045                            error_message_command_warning.append(message)
5046                        if line.startswith("[E::"):
5047                            error_message_command_err.append(f"{err_file}: " + message)
5048            # log info
5049            for message in list(
5050                set(error_message_command_err + error_message_command_warning)
5051            ):
5052                log.info(f"   {message}")
5053            # debug info
5054            for message in list(set(error_message_command_all)):
5055                log.debug(f"   {message}")
5056            # failed
5057            if len(error_message_command_err):
5058                log.error("Annotation failed: Error in commands")
5059                raise ValueError("Annotation failed: Error in commands")
5060
5061            # Find annotation in header
5062            with open(tmp_annotate_vcf_name, "rt") as f:
5063                header_list = self.read_vcf_header(f)
5064            annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))
5065
5066            for ann in annovar_vcf_header.infos:
5067                if ann not in self.get_header().infos:
5068                    vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)
5069
5070            # Update variants
5071            log.info(f"Annotation - Updating...")
5072            self.update_from_vcf(tmp_annotate_vcf_name)
5073
5074        else:
5075            if "ANN" in self.get_header().infos:
5076                log.debug(f"Existing snpEff annotations in VCF")
5077            if force_update_annotation:
5078                log.debug(f"Existing snpEff annotations in VCF - annotation forced")

This function annotates with snpEff.

Parameters
  • threads: The number of threads to use
Returns

the value of the variable "return_value".

def annotation_annovar(self, threads: int = None) -> None:
    def annotation_annovar(self, threads: int = None) -> None:
        """
        Annotate variants with Annovar (table_annovar.pl).

        Exports the variants to a temporary VCF, runs one table_annovar.pl
        command per configured database (downloading missing Annovar
        databases first), cleans and renames the resulting INFO fields
        through a bcftools/sed/awk pipeline, merges all annotated VCFs back
        together with bcftools merge, and updates the variants table from
        the merged VCF.

        :param threads: number of threads to use (defaults to the configured
            thread count)
        :return: None (the variants table is updated in place)
        :raises ValueError: if the annovar or bcftools command cannot be
            resolved, or if an annotation command reports errors
        """

        # DEBUG
        log.debug("Start annotation with Annovar databases")

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Tmp en Err files (collected for final cleanup)
        tmp_files = []
        err_files = []

        # Keep tmp files around when running in debug verbosity
        delete_tmp = True
        if self.get_config().get("verbosity", "warning") in ["debug"]:
            delete_tmp = False
            log.debug("Delete tmp files/folders: " + str(delete_tmp))

        # Config
        config = self.get_config()
        log.debug("Config: " + str(config))

        # Config - Folders - Databases
        databases_folders = (
            config.get("folders", {}).get("databases", {}).get("annovar", ["."])
        )
        log.debug("Databases annotations: " + str(databases_folders))

        # Config - annovar bin command (perl table_annovar.pl invocation)
        annovar_bin_command = get_bin_command(
            bin="table_annovar.pl",
            tool="annovar",
            bin_type="perl",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/annovar",
        )
        if not annovar_bin_command:
            msg_err = f"Annotation failed: no annovar bin '{annovar_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - BCFTools bin command (used for cleaning, renaming, merging)
        bcftools_bin_command = get_bin_command(
            bin="bcftools",
            tool="bcftools",
            bin_type="bin",
            config=config,
            default_folder=f"{DEFAULT_TOOLS_FOLDER}/bcftools",
        )
        if not bcftools_bin_command:
            msg_err = f"Annotation failed: no bcftools bin '{bcftools_bin_command}'"
            log.error(msg_err)
            raise ValueError(msg_err)

        # Config - annovar databases folder (created if missing)
        annovar_databases = (
            config.get("folders", {})
            .get("databases", {})
            .get("annovar", DEFAULT_ANNOVAR_FOLDER)
        )
        annovar_databases = full_path(annovar_databases)
        if annovar_databases != "" and not os.path.exists(annovar_databases):
            os.makedirs(annovar_databases)

        # Param
        param = self.get_param()
        log.debug("Param: " + str(param))

        # Param - options (extra table_annovar.pl options, e.g. "genebase")
        options = param.get("annotation", {}).get("annovar", {}).get("options", {})
        log.debug("Options: " + str(options))

        # Param - annotations: {database_name: {field: renamed_field, ...}, ...}
        annotations = (
            param.get("annotation", {}).get("annovar", {}).get("annotations", {})
        )
        log.debug("Annotations: " + str(annotations))

        # Param - Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Annovar database assembly subfolder (created if missing)
        annovar_databases_assembly = f"{annovar_databases}/{assembly}"
        if annovar_databases_assembly != "" and not os.path.exists(
            annovar_databases_assembly
        ):
            os.makedirs(annovar_databases_assembly)

        # Data
        table_variants = self.get_table_variants()

        # Check if not empty (nothing to annotate otherwise)
        log.debug("Check if not empty")
        sql_query_chromosomes = (
            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
        )
        sql_query_chromosomes_df = self.get_query_to_df(sql_query_chromosomes)
        if not sql_query_chromosomes_df["count"][0]:
            log.info(f"VCF empty")
            return

        # VCF header
        vcf_reader = self.get_header()
        log.debug("Initial header: " + str(vcf_reader.infos))

        # Existing annotations
        for vcf_annotation in self.get_header().infos:

            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
            log.debug(
                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
            )

        force_update_annotation = True

        if annotations:

            commands = []
            tmp_annotates_vcf_name_list = []

            # Export in VCF
            log.debug("Create initial file to annotate")
            tmp_vcf = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".vcf.gz",
                delete=False,
            )
            tmp_vcf_name = tmp_vcf.name
            tmp_files.append(tmp_vcf_name)
            tmp_files.append(tmp_vcf_name + ".tbi")

            # Export VCF file
            # NOTE(review): remove_info="." presumably replaces INFO content
            # with the "." placeholder (vs remove_info=True elsewhere) — confirm
            self.export_variant_vcf(
                vcf_file=tmp_vcf_name,
                remove_info=".",
                add_samples=False,
                index=True,
            )

            # Create file for field rename (bcftools --rename-annots input)
            log.debug("Create file for field rename")
            tmp_rename = NamedTemporaryFile(
                prefix=self.get_prefix(),
                dir=self.get_tmp_dir(),
                suffix=".rename",
                delete=False,
            )
            tmp_rename_name = tmp_rename.name
            tmp_files.append(tmp_rename_name)

            # Check Annovar database (downloads missing databases)
            log.debug(
                f"Check Annovar databases {[assembly]}: {list(annotations.keys())}"
            )
            databases_download_annovar(
                folder=annovar_databases,
                files=list(annotations.keys()),
                assemblies=[assembly],
            )

            # One table_annovar.pl run per configured database
            for annotation in annotations:
                annotation_fields = annotations[annotation]

                if not annotation_fields:
                    annotation_fields = {"INFO": None}

                log.info(f"Annotations Annovar - database '{annotation}'")
                log.debug(f"Annotation '{annotation}' - fields: {annotation_fields}")

                # Tmp file for annovar
                # NOTE: err_files is reset here, so only the current database's
                # stderr is scanned below
                err_files = []
                tmp_annotate_vcf_directory = TemporaryDirectory(
                    prefix=self.get_prefix(), dir=self.get_tmp_dir(), suffix=".annovar"
                )
                tmp_annotate_vcf_prefix = tmp_annotate_vcf_directory.name + "/annovar"
                tmp_annotate_vcf_name_annovar = (
                    tmp_annotate_vcf_prefix + "." + assembly + "_multianno.vcf"
                )
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_directory.name + "/.err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Tmp file final vcf annotated by annovar
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_annotates_vcf_name_list.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_files.append(tmp_annotate_vcf_name + ".tbi")

                # Number of fields
                annotation_list = []
                annotation_renamed_list = []

                for annotation_field in annotation_fields:

                    # field new name, if parametered SKIPPED !!!!!! not managed actually TODO
                    annotation_fields_new_name = annotation_fields.get(
                        annotation_field, annotation_field
                    )
                    if not annotation_fields_new_name:
                        annotation_fields_new_name = annotation_field

                    if (
                        force_update_annotation
                        or annotation_fields_new_name not in self.get_header().infos
                    ):
                        annotation_list.append(annotation_field)
                        annotation_renamed_list.append(annotation_fields_new_name)
                    else:  # annotation_fields_new_name in self.get_header().infos and not force_update_annotation:
                        log.warning(
                            f"Annotation '{annotation}' - '{annotation_fields_new_name}' - already exists (skipped)"
                        )

                    # Add rename info ("INFO/old new" line per field)
                    run_parallel_commands(
                        [
                            f"echo 'INFO/{annotation_field} {annotation_fields_new_name}' >> {tmp_rename_name}"
                        ],
                        1,
                    )

                # log.debug("fields_to_removed: " + str(fields_to_removed))
                log.debug("annotation_list: " + str(annotation_list))

                # protocol
                protocol = annotation

                # argument
                argument = ""

                # operation: "g" gene-based, "r" region-based, "f" filter-based
                operation = "f"
                if annotation in ["refGene", "refGeneWithVer"] or annotation.startswith(
                    "ensGene"
                ):
                    operation = "g"
                    if options.get("genebase", None):
                        argument = f"""'{options.get("genebase","")}'"""
                elif annotation in ["cytoBand"]:
                    operation = "r"

                # argument option
                argument_option = ""
                if argument != "":
                    argument_option = " --argument " + argument

                # command options
                command_options = f""" --nastring . --vcfinput --polish --dot2underline --thread {threads} """  # --intronhgvs 10
                for option in options:
                    if option not in ["genebase"]:
                        command_options += f""" --{option}={options[option]}"""

                # Command

                # Command - Annovar
                command_annovar = f"""{annovar_bin_command} {tmp_vcf_name} {annovar_databases_assembly} --buildver {assembly} --outfile {tmp_annotate_vcf_prefix} --remove --protocol {protocol} --operation {operation} {argument_option} {command_options} 2>>{tmp_annotate_vcf_name_err} && mv {tmp_annotate_vcf_name_annovar} {tmp_annotate_vcf_name}.tmp.vcf """
                tmp_files.append(f"{tmp_annotate_vcf_name}.tmp.vcf")

                # Command - start pipe
                command_annovar += f""" && {bcftools_bin_command} view --threads={threads} {tmp_annotate_vcf_name}.tmp.vcf 2>>{tmp_annotate_vcf_name_err} """

                # Command - Clean INFO/ANNOVAR_DATE (due to Annovar issue with multiple TAGS!)
                command_annovar += """ | sed "s/ANNOVAR_DATE=[^;\t]*;//gi" """

                # Command - Special characters (refGene annotation)
                command_annovar += """ | sed "s/\\\\\\x3b/,/gi" """

                # Command - Clean empty fields (with value ".")
                # Rebuilds INFO keeping only key=value pairs whose value is not "."
                command_annovar += """ | awk -F'\\t' -v OFS='\\t' '{if ($0 ~ /^#/) print; else {split($8,a,";");for(i=1;i<=length(a);i++) {split(a[i],b,"=");if(b[2]!=".") {c[b[1]]=b[2]}}; split($8,d,";");for(i=1;i<=length(d);i++) {split(d[i],e,"=");if(c[e[1]]!="") {if(info!="") {info=info";"}; info=info""e[1]"="c[e[1]]}}; if(info!="") {$8=info} else {$8=""}; delete c; info=""; print}}' """

                # Command - Extract only needed fields, and remove ANNOVAR fields, and compress and index final file
                annovar_fields_to_keep = ["INFO/ANNOVAR_DATE", "INFO/ALLELE_END"]
                if "ALL" not in annotation_list and "INFO" not in annotation_list:
                    # for ann in annotation_renamed_list:
                    for ann in annotation_list:
                        annovar_fields_to_keep.append(f"^INFO/{ann}")

                command_annovar += f""" | {bcftools_bin_command} annotate --pair-logic exact --threads={threads} -x {",".join(annovar_fields_to_keep)} --rename-annots={tmp_rename_name} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} """

                # Command - indexing
                command_annovar += f"""  && tabix {tmp_annotate_vcf_name} """

                log.debug(f"Annotation - Annovar command: {command_annovar}")
                run_parallel_commands([command_annovar], 1)

                # Error messages: scan stderr for htslib/Annovar warnings and errors
                log.info(f"Error/Warning messages:")
                error_message_command_all = []
                error_message_command_warning = []
                error_message_command_err = []
                for err_file in err_files:
                    with open(err_file, "r") as f:
                        for line in f:
                            message = line.strip()
                            error_message_command_all.append(message)
                            if line.startswith("[W::") or line.startswith("WARNING"):
                                error_message_command_warning.append(message)
                            if line.startswith("[E::") or line.startswith("ERROR"):
                                error_message_command_err.append(
                                    f"{err_file}: " + message
                                )
                # log info
                for message in list(
                    set(error_message_command_err + error_message_command_warning)
                ):
                    log.info(f"   {message}")
                # debug info
                for message in list(set(error_message_command_all)):
                    log.debug(f"   {message}")
                # failed
                if len(error_message_command_err):
                    log.error("Annotation failed: Error in commands")
                    raise ValueError("Annotation failed: Error in commands")

            if tmp_annotates_vcf_name_list:

                # List of annotated files
                tmp_annotates_vcf_name_to_merge = " ".join(tmp_annotates_vcf_name_list)

                # Tmp file for the merged result
                tmp_annotate_vcf = NamedTemporaryFile(
                    prefix=self.get_prefix(),
                    dir=self.get_tmp_dir(),
                    suffix=".vcf.gz",
                    delete=False,
                )
                tmp_annotate_vcf_name = tmp_annotate_vcf.name
                tmp_files.append(tmp_annotate_vcf_name)
                tmp_annotate_vcf_name_err = tmp_annotate_vcf_name + ".err"
                err_files.append(tmp_annotate_vcf_name_err)
                tmp_files.append(tmp_annotate_vcf_name_err)

                # Command merge: combine the original VCF with all annotated VCFs
                merge_command = f"{bcftools_bin_command} merge --force-samples --threads={threads} {tmp_vcf_name} {tmp_annotates_vcf_name_to_merge} -o {tmp_annotate_vcf_name} -Oz 2>>{tmp_annotate_vcf_name_err} "
                log.info(
                    f"Annotation Annovar - Annotation merging "
                    + str(len(tmp_annotates_vcf_name_list))
                    + " annotated files"
                )
                log.debug(f"Annotation - merge command: {merge_command}")
                run_parallel_commands([merge_command], 1)

                # Find annotation in header of the merged VCF
                with bgzf.open(tmp_annotate_vcf_name, "rt") as f:
                    header_list = self.read_vcf_header(f)
                annovar_vcf_header = vcf.Reader(io.StringIO("\n".join(header_list)))

                # Merge new INFO fields into the current header (existing ones win)
                for ann in annovar_vcf_header.infos:
                    if ann not in self.get_header().infos:
                        vcf_reader.infos[ann] = annovar_vcf_header.infos.get(ann)

                # Update variants
                log.info(f"Annotation Annovar - Updating...")
                self.update_from_vcf(tmp_annotate_vcf_name)

            # Clean files
            # Tmp file remove command
            # NOTE(review): cleanup runs unconditionally ("if True"), ignoring
            # delete_tmp computed above — confirm this is intended
            if True:
                tmp_files_remove_command = ""
                if tmp_files:
                    tmp_files_remove_command = " ".join(tmp_files)
                clean_command = f" rm -f {tmp_files_remove_command} "
                log.debug(f"Annotation Annovar - Annotation cleaning ")
                log.debug(f"Annotation - cleaning command: {clean_command}")
                run_parallel_commands([clean_command], 1)

It takes a VCF file, annotates it with Annovar, and then updates the database with the new annotations

Parameters
  • threads: number of threads to use
Returns

the value of the variable "return_value".

def annotation_parquet(self, threads: int = None) -> None:
5462    def annotation_parquet(self, threads: int = None) -> None:
5463        """
5464        It takes a VCF file, and annotates it with a parquet file
5465
5466        :param threads: number of threads to use for the annotation
5467        :return: the value of the variable "result".
5468        """
5469
5470        # DEBUG
5471        log.debug("Start annotation with parquet databases")
5472
5473        # Threads
5474        if not threads:
5475            threads = self.get_threads()
5476        log.debug("Threads: " + str(threads))
5477
5478        # DEBUG
5479        delete_tmp = True
5480        if self.get_config().get("verbosity", "warning") in ["debug"]:
5481            delete_tmp = False
5482            log.debug("Delete tmp files/folders: " + str(delete_tmp))
5483
5484        # Config
5485        databases_folders = set(
5486            self.get_config()
5487            .get("folders", {})
5488            .get("databases", {})
5489            .get("annotations", ["."])
5490            + self.get_config()
5491            .get("folders", {})
5492            .get("databases", {})
5493            .get("parquet", ["."])
5494        )
5495        log.debug("Databases annotations: " + str(databases_folders))
5496
5497        # Param
5498        annotations = (
5499            self.get_param()
5500            .get("annotation", {})
5501            .get("parquet", {})
5502            .get("annotations", None)
5503        )
5504        log.debug("Annotations: " + str(annotations))
5505
5506        # Assembly
5507        assembly = self.get_param().get(
5508            "assembly", self.get_config().get("assembly", DEFAULT_ASSEMBLY)
5509        )
5510
5511        # Force Update Annotation
5512        force_update_annotation = (
5513            self.get_param()
5514            .get("annotation", {})
5515            .get("options", {})
5516            .get("annotations_update", False)
5517        )
5518        log.debug(f"force_update_annotation={force_update_annotation}")
5519        force_append_annotation = (
5520            self.get_param()
5521            .get("annotation", {})
5522            .get("options", {})
5523            .get("annotations_append", False)
5524        )
5525        log.debug(f"force_append_annotation={force_append_annotation}")
5526
5527        # Data
5528        table_variants = self.get_table_variants()
5529
5530        # Check if not empty
5531        log.debug("Check if not empty")
5532        sql_query_chromosomes_df = self.get_query_to_df(
5533            f"""SELECT count(*) as count FROM {table_variants} as table_variants LIMIT 1"""
5534        )
5535        if not sql_query_chromosomes_df["count"][0]:
5536            log.info(f"VCF empty")
5537            return
5538
5539        # VCF header
5540        vcf_reader = self.get_header()
5541        log.debug("Initial header: " + str(vcf_reader.infos))
5542
5543        # Nb Variants POS
5544        log.debug("NB Variants Start")
5545        nb_variants = self.conn.execute(
5546            f"SELECT count(*) AS count FROM variants"
5547        ).fetchdf()["count"][0]
5548        log.debug("NB Variants Stop")
5549
5550        # Existing annotations
5551        for vcf_annotation in self.get_header().infos:
5552
5553            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
5554            log.debug(
5555                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
5556            )
5557
5558        # Added columns
5559        added_columns = []
5560
5561        # drop indexes
5562        log.debug(f"Drop indexes...")
5563        self.drop_indexes()
5564
5565        if annotations:
5566
5567            if "ALL" in annotations:
5568
5569                all_param = annotations.get("ALL", {})
5570                all_param_formats = all_param.get("formats", None)
5571                all_param_releases = all_param.get("releases", None)
5572
5573                databases_infos_dict = self.scan_databases(
5574                    database_formats=all_param_formats,
5575                    database_releases=all_param_releases,
5576                )
5577                for database_infos in databases_infos_dict.keys():
5578                    if database_infos not in annotations:
5579                        annotations[database_infos] = {"INFO": None}
5580
5581            for annotation in annotations:
5582
5583                if annotation in ["ALL"]:
5584                    continue
5585
5586                # Annotation Name
5587                annotation_name = os.path.basename(annotation)
5588
5589                # Annotation fields
5590                annotation_fields = annotations[annotation]
5591                if not annotation_fields:
5592                    annotation_fields = {"INFO": None}
5593
5594                log.debug(f"Annotation '{annotation_name}'")
5595                log.debug(
5596                    f"Annotation '{annotation_name}' - fields: {annotation_fields}"
5597                )
5598
5599                # Create Database
5600                database = Database(
5601                    database=annotation,
5602                    databases_folders=databases_folders,
5603                    assembly=assembly,
5604                )
5605
5606                # Find files
5607                parquet_file = database.get_database()
5608                parquet_hdr_file = database.get_header_file()
5609                parquet_type = database.get_type()
5610
5611                # Check if files exists
5612                if not parquet_file or not parquet_hdr_file:
5613                    log.error("Annotation failed: file not found")
5614                    raise ValueError("Annotation failed: file not found")
5615                else:
5616                    # Get parquet connexion
5617                    parquet_sql_attach = database.get_sql_database_attach(
5618                        output="query"
5619                    )
5620                    if parquet_sql_attach:
5621                        self.conn.execute(parquet_sql_attach)
5622                    parquet_file_link = database.get_sql_database_link()
5623                    # Log
5624                    log.debug(
5625                        f"Annotation '{annotation_name}' - file: "
5626                        + str(parquet_file)
5627                        + " and "
5628                        + str(parquet_hdr_file)
5629                    )
5630
5631                    # Database full header columns
5632                    parquet_hdr_vcf_header_columns = database.get_header_file_columns(
5633                        parquet_hdr_file
5634                    )
5635                    # Log
5636                    log.debug(
5637                        "Annotation database header columns : "
5638                        + str(parquet_hdr_vcf_header_columns)
5639                    )
5640
5641                    # Load header as VCF object
5642                    parquet_hdr_vcf_header_infos = database.get_header().infos
5643                    # Log
5644                    log.debug(
5645                        "Annotation database header: "
5646                        + str(parquet_hdr_vcf_header_infos)
5647                    )
5648
5649                    # Get extra infos
5650                    parquet_columns = database.get_extra_columns()
5651                    # Log
5652                    log.debug("Annotation database Columns: " + str(parquet_columns))
5653
5654                    # Add extra columns if "ALL" in annotation_fields
5655                    # if "ALL" in annotation_fields:
5656                    #     allow_add_extra_column = True
5657                    if "ALL" in annotation_fields and database.get_extra_columns():
5658                        for extra_column in database.get_extra_columns():
5659                            if (
5660                                extra_column not in annotation_fields
5661                                and extra_column.replace("INFO/", "")
5662                                not in parquet_hdr_vcf_header_infos
5663                            ):
5664                                parquet_hdr_vcf_header_infos[extra_column] = (
5665                                    vcf.parser._Info(
5666                                        extra_column,
5667                                        ".",
5668                                        "String",
5669                                        f"{extra_column} description",
5670                                        "unknown",
5671                                        "unknown",
5672                                        self.code_type_map["String"],
5673                                    )
5674                                )
5675
5676                    # For all fields in database
5677                    annotation_fields_all = False
5678                    if "ALL" in annotation_fields or "INFO" in annotation_fields:
5679                        annotation_fields_all = True
5680                        annotation_fields = {
5681                            key: key for key in parquet_hdr_vcf_header_infos
5682                        }
5683
5684                        log.debug(
5685                            "Annotation database header - All annotations added: "
5686                            + str(annotation_fields)
5687                        )
5688
5689                    # Init
5690
5691                    # List of annotation fields to use
5692                    sql_query_annotation_update_info_sets = []
5693
5694                    # List of annotation to agregate
5695                    sql_query_annotation_to_agregate = []
5696
5697                    # Number of fields
5698                    nb_annotation_field = 0
5699
5700                    # Annotation fields processed
5701                    annotation_fields_processed = []
5702
5703                    # Columns mapping
5704                    map_columns = database.map_columns(
5705                        columns=annotation_fields, prefixes=["INFO/"]
5706                    )
5707
5708                    # Query dict for fields to remove (update option)
5709                    query_dict_remove = {}
5710
5711                    # Fetch Anotation fields
5712                    for annotation_field in annotation_fields:
5713
5714                        # annotation_field_column
5715                        annotation_field_column = map_columns.get(
5716                            annotation_field, "INFO"
5717                        )
5718
5719                        # field new name, if parametered
5720                        annotation_fields_new_name = annotation_fields.get(
5721                            annotation_field, annotation_field
5722                        )
5723                        if not annotation_fields_new_name:
5724                            annotation_fields_new_name = annotation_field
5725
5726                        # To annotate
5727                        # force_update_annotation = True
5728                        # force_append_annotation = True
5729                        # if annotation_field in parquet_hdr_vcf_header_infos and (force_update_annotation or (annotation_fields_new_name not in self.get_header().infos)):
5730                        if annotation_field in parquet_hdr_vcf_header_infos and (
5731                            force_update_annotation
5732                            or force_append_annotation
5733                            or (
5734                                annotation_fields_new_name
5735                                not in self.get_header().infos
5736                            )
5737                        ):
5738
5739                            # Add field to annotation to process list
5740                            annotation_fields_processed.append(
5741                                annotation_fields_new_name
5742                            )
5743
5744                            # explode infos for the field
5745                            annotation_fields_new_name_info_msg = ""
5746                            if (
5747                                force_update_annotation
5748                                and annotation_fields_new_name
5749                                in self.get_header().infos
5750                            ):
5751                                # Remove field from INFO
5752                                query = f"""
5753                                    UPDATE {table_variants} as table_variants
5754                                    SET INFO = REGEXP_REPLACE(
5755                                                concat(table_variants.INFO,''),
5756                                                ';*{annotation_fields_new_name}=[^;]*',
5757                                                ''
5758                                                )
5759                                    WHERE concat(';',table_variants.INFO) LIKE '%;{annotation_fields_new_name}=%'
5760                                """
5761                                annotation_fields_new_name_info_msg = " [update]"
5762                                query_dict_remove[
5763                                    f"remove 'INFO/{annotation_fields_new_name}'"
5764                                ] = query
5765
5766                            # Sep between fields in INFO
5767                            nb_annotation_field += 1
5768                            if nb_annotation_field > 1:
5769                                annotation_field_sep = ";"
5770                            else:
5771                                annotation_field_sep = ""
5772
5773                            log.info(
5774                                f"Annotation '{annotation_name}' - '{annotation_field}' -> '{annotation_fields_new_name}'{annotation_fields_new_name_info_msg}"
5775                            )
5776
5777                            # Add INFO field to header
5778                            parquet_hdr_vcf_header_infos_number = (
5779                                parquet_hdr_vcf_header_infos[annotation_field].num
5780                                or "."
5781                            )
5782                            parquet_hdr_vcf_header_infos_type = (
5783                                parquet_hdr_vcf_header_infos[annotation_field].type
5784                                or "String"
5785                            )
5786                            parquet_hdr_vcf_header_infos_description = (
5787                                parquet_hdr_vcf_header_infos[annotation_field].desc
5788                                or f"{annotation_field} description"
5789                            )
5790                            parquet_hdr_vcf_header_infos_source = (
5791                                parquet_hdr_vcf_header_infos[annotation_field].source
5792                                or "unknown"
5793                            )
5794                            parquet_hdr_vcf_header_infos_version = (
5795                                parquet_hdr_vcf_header_infos[annotation_field].version
5796                                or "unknown"
5797                            )
5798
5799                            vcf_reader.infos[annotation_fields_new_name] = (
5800                                vcf.parser._Info(
5801                                    annotation_fields_new_name,
5802                                    parquet_hdr_vcf_header_infos_number,
5803                                    parquet_hdr_vcf_header_infos_type,
5804                                    parquet_hdr_vcf_header_infos_description,
5805                                    parquet_hdr_vcf_header_infos_source,
5806                                    parquet_hdr_vcf_header_infos_version,
5807                                    self.code_type_map[
5808                                        parquet_hdr_vcf_header_infos_type
5809                                    ],
5810                                )
5811                            )
5812
5813                            # Append
5814                            if force_append_annotation:
5815                                query_case_when_append = f""" AND REGEXP_EXTRACT(concat(';', table_variants.INFO), ';{annotation_fields_new_name}=([^;]*)',1) IN ('','.') """
5816                            else:
5817                                query_case_when_append = ""
5818
5819                            # Annotation/Update query fields
5820                            # Found in INFO column
5821                            if (
5822                                annotation_field_column == "INFO"
5823                                and "INFO" in parquet_hdr_vcf_header_columns
5824                            ):
5825                                sql_query_annotation_update_info_sets.append(
5826                                    f"""
5827                                CASE WHEN REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1) NOT IN ('','.') {query_case_when_append}
5828                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', REGEXP_EXTRACT(concat(';', table_parquet.INFO), ';{annotation_field}=([^;]*)',1))
5829                                        ELSE ''
5830                                    END
5831                                """
5832                                )
5833                            # Found in a specific column
5834                            else:
5835                                sql_query_annotation_update_info_sets.append(
5836                                    f"""
5837                                CASE WHEN CAST(table_parquet."{annotation_field_column}" AS VARCHAR) NOT IN ('','.') {query_case_when_append}
5838                                        THEN concat('{annotation_field_sep}', '{annotation_fields_new_name}=', replace(CAST(table_parquet."{annotation_field_column}" AS VARCHAR), ';', ','))
5839                                        ELSE ''
5840                                    END
5841                                """
5842                                )
5843                                sql_query_annotation_to_agregate.append(
5844                                    f""" string_agg(DISTINCT table_parquet_from."{annotation_field_column}", ',') AS "{annotation_field_column}" """
5845                                )
5846
5847                        # Not to annotate
5848                        else:
5849
5850                            if force_update_annotation:
5851                                annotation_message = "forced"
5852                            else:
5853                                annotation_message = "skipped"
5854
5855                            if annotation_field not in parquet_hdr_vcf_header_infos:
5856                                log.warning(
5857                                    f"Annotation '{annotation_name}' - '{annotation_field}' [{nb_annotation_field}] - not available in parquet file"
5858                                )
5859                            if annotation_fields_new_name in self.get_header().infos:
5860                                log.warning(
5861                                    f"Annotation '{annotation_name}' - '{annotation_fields_new_name}' [{nb_annotation_field}] - already exists in header ({annotation_message})"
5862                                )
5863
5864                    # Check if ALL fields have to be annotated. Thus concat all INFO field
5865                    # allow_annotation_full_info = True
5866                    allow_annotation_full_info = not force_append_annotation
5867
5868                    if parquet_type in ["regions"]:
5869                        allow_annotation_full_info = False
5870
5871                    if (
5872                        allow_annotation_full_info
5873                        and nb_annotation_field == len(annotation_fields)
5874                        and annotation_fields_all
5875                        and (
5876                            "INFO" in parquet_hdr_vcf_header_columns
5877                            and "INFO" in database.get_extra_columns()
5878                        )
5879                    ):
5880                        log.debug("Column INFO annotation enabled")
5881                        sql_query_annotation_update_info_sets = []
5882                        sql_query_annotation_update_info_sets.append(
5883                            f" table_parquet.INFO "
5884                        )
5885
5886                    if sql_query_annotation_update_info_sets:
5887
5888                        # Annotate
5889                        log.info(f"Annotation '{annotation_name}' - Annotation...")
5890
5891                        # Join query annotation update info sets for SQL
5892                        sql_query_annotation_update_info_sets_sql = ",".join(
5893                            sql_query_annotation_update_info_sets
5894                        )
5895
5896                        # Check chromosomes list (and variants infos)
5897                        sql_query_chromosomes = f"""
5898                            SELECT table_variants."#CHROM" as CHROM, count(*) AS count_variants, min(POS) AS min_variants, MAX(POS) AS max_variants
5899                            FROM {table_variants} as table_variants
5900                            GROUP BY table_variants."#CHROM"
5901                            ORDER BY table_variants."#CHROM"
5902                            """
5903                        sql_query_chromosomes_df = self.conn.execute(
5904                            sql_query_chromosomes
5905                        ).df()
5906                        sql_query_chromosomes_dict = {
5907                            entry["CHROM"]: {
5908                                "count": entry["count_variants"],
5909                                "min": entry["min_variants"],
5910                                "max": entry["max_variants"],
5911                            }
5912                            for index, entry in sql_query_chromosomes_df.iterrows()
5913                        }
5914
5915                        # Init
5916                        nb_of_query = 0
5917                        nb_of_variant_annotated = 0
5918                        query_dict = query_dict_remove
5919
5920                        # for chrom in sql_query_chromosomes_df["CHROM"]:
5921                        for chrom in sql_query_chromosomes_dict:
5922
5923                            # Number of variant by chromosome
5924                            nb_of_variant_by_chrom = sql_query_chromosomes_dict.get(
5925                                chrom, {}
5926                            ).get("count", 0)
5927
5928                            log.debug(
5929                                f"Annotation '{annotation_name}' - Chromosome '{chrom}' [{nb_of_variant_by_chrom} variants]..."
5930                            )
5931
5932                            # Annotation with regions database
5933                            if parquet_type in ["regions"]:
5934                                sql_query_annotation_from_clause = f"""
5935                                    FROM (
5936                                        SELECT 
5937                                            '{chrom}' AS \"#CHROM\",
5938                                            table_variants_from.\"POS\" AS \"POS\",
5939                                            {",".join(sql_query_annotation_to_agregate)}
5940                                        FROM {table_variants} as table_variants_from
5941                                        LEFT JOIN {parquet_file_link} as table_parquet_from ON (
5942                                            table_parquet_from."#CHROM" = '{chrom}'
5943                                            AND table_variants_from.\"POS\" <= table_parquet_from.\"END\"
5944                                            AND (table_variants_from.\"POS\" >= (table_parquet_from.\"START\"+1)
5945                                                OR table_variants_from.\"POS\" + (len(table_variants_from.\"REF\")-1) >= (table_parquet_from.\"START\"+1)
5946                                                )
5947                                        )
5948                                        WHERE table_variants_from.\"#CHROM\" in ('{chrom}')
5949                                        GROUP BY table_variants_from.\"POS\"
5950                                        )
5951                                        as table_parquet
5952                                """
5953
5954                                sql_query_annotation_where_clause = """
5955                                    table_parquet.\"#CHROM\" = table_variants.\"#CHROM\"
5956                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5957                                """
5958
5959                            # Annotation with variants database
5960                            else:
5961                                sql_query_annotation_from_clause = f"""
5962                                    FROM {parquet_file_link} as table_parquet
5963                                """
5964                                sql_query_annotation_where_clause = f"""
5965                                    table_variants."#CHROM" = '{chrom}'
5966                                    AND table_parquet.\"#CHROM\" = table_variants.\"#CHROM\" 
5967                                    AND table_parquet.\"POS\" = table_variants.\"POS\"
5968                                    AND table_parquet.\"ALT\" = table_variants.\"ALT\"
5969                                    AND table_parquet.\"REF\" = table_variants.\"REF\"
5970                                """
5971
5972                            # Create update query
5973                            sql_query_annotation_chrom_interval_pos = f"""
5974                                UPDATE {table_variants} as table_variants
5975                                    SET INFO = 
5976                                        concat(
5977                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5978                                                THEN table_variants.INFO
5979                                                ELSE ''
5980                                            END
5981                                            ,
5982                                            CASE WHEN table_variants.INFO NOT IN ('','.')
5983                                                        AND (
5984                                                        concat({sql_query_annotation_update_info_sets_sql})
5985                                                        )
5986                                                        NOT IN ('','.') 
5987                                                    THEN ';'
5988                                                    ELSE ''
5989                                            END
5990                                            ,
5991                                            {sql_query_annotation_update_info_sets_sql}
5992                                            )
5993                                    {sql_query_annotation_from_clause}
5994                                    WHERE {sql_query_annotation_where_clause}
5995                                    ;
5996                                """
5997
5998                            # Add update query to dict
5999                            query_dict[
6000                                f"{chrom} [{nb_of_variant_by_chrom} variants]"
6001                            ] = sql_query_annotation_chrom_interval_pos
6002
6003                        nb_of_query = len(query_dict)
6004                        num_query = 0
6005
6006                        # SET max_expression_depth TO x
6007                        self.conn.execute("SET max_expression_depth TO 10000")
6008
6009                        for query_name in query_dict:
6010                            query = query_dict[query_name]
6011                            num_query += 1
6012                            log.info(
6013                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name}..."
6014                            )
6015                            result = self.conn.execute(query)
6016                            nb_of_variant_annotated_by_query = result.df()["Count"][0]
6017                            nb_of_variant_annotated += nb_of_variant_annotated_by_query
6018                            log.info(
6019                                f"Annotation '{annotation_name}' - Annotation - Query [{num_query}/{nb_of_query}] {query_name} - {nb_of_variant_annotated_by_query} variants annotated"
6020                            )
6021
6022                        log.info(
6023                            f"Annotation '{annotation_name}' - Annotation of {nb_of_variant_annotated} variants out of {nb_variants} (with {nb_of_query} queries)"
6024                        )
6025
6026                    else:
6027
6028                        log.info(
6029                            f"Annotation '{annotation_name}' - No Annotations available"
6030                        )
6031
6032                    log.debug("Final header: " + str(vcf_reader.infos))
6033
6034        # Remove added columns
6035        for added_column in added_columns:
6036            self.drop_column(column=added_column)

It takes a VCF file, and annotates it with a parquet file.

Parameters
  • threads: number of threads to use for the annotation.

Returns
  The value of the variable "result".

def annotation_splice(self, threads: int = None) -> None:
6038    def annotation_splice(self, threads: int = None) -> None:
6039        """
6040        This function annotate with snpEff
6041
6042        :param threads: The number of threads to use
6043        :return: the value of the variable "return_value".
6044        """
6045
6046        # DEBUG
6047        log.debug("Start annotation with splice tools")
6048
6049        # Threads
6050        if not threads:
6051            threads = self.get_threads()
6052        log.debug("Threads: " + str(threads))
6053
6054        # DEBUG
6055        delete_tmp = True
6056        if self.get_config().get("verbosity", "warning") in ["debug"]:
6057            delete_tmp = False
6058            log.debug("Delete tmp files/folders: " + str(delete_tmp))
6059
6060        # Config
6061        config = self.get_config()
6062        log.debug("Config: " + str(config))
6063        splice_config = config.get("tools", {}).get("splice", {})
6064        if not splice_config:
6065            splice_config = DEFAULT_TOOLS_BIN.get("splice", {})
6066        if not splice_config:
6067            msg_err = "No Splice tool config"
6068            log.error(msg_err)
6069            raise ValueError(msg_err)
6070        log.debug(f"splice_config={splice_config}")
6071
6072        # Config - Folders - Databases
6073        databases_folders = (
6074            config.get("folders", {}).get("databases", {}).get("splice", ["."])
6075        )
6076        log.debug("Databases annotations: " + str(databases_folders))
6077
6078        # Splice docker image
6079        splice_docker_image = splice_config.get("docker").get("image")
6080
6081        # Pull splice image if it's not already there
6082        if not check_docker_image_exists(splice_docker_image):
6083            log.warning(
6084                f"Annotation: splice docker image {splice_docker_image} not found locally, trying to pull from dockerhub"
6085            )
6086            try:
6087                command(f"docker pull {splice_config.get('docker').get('image')}")
6088            except subprocess.CalledProcessError:
6089                msg_err = f"Unable to find docker {splice_docker_image} on dockerhub"
6090                log.error(msg_err)
6091                raise ValueError(msg_err)
6092                return None
6093
6094        # Config - splice databases
6095        splice_databases = (
6096            config.get("folders", {})
6097            .get("databases", {})
6098            .get("splice", DEFAULT_SPLICE_FOLDER)
6099        )
6100        splice_databases = full_path(splice_databases)
6101
6102        # Param
6103        param = self.get_param()
6104        log.debug("Param: " + str(param))
6105
6106        # Param
6107        options = param.get("annotation", {}).get("splice", {})
6108        log.debug("Options: " + str(options))
6109
6110        # Data
6111        table_variants = self.get_table_variants()
6112
6113        # Check if not empty
6114        log.debug("Check if not empty")
6115        sql_query_chromosomes = (
6116            f"""SELECT count(*) as count FROM {table_variants} as table_variants"""
6117        )
6118        if not self.get_query_to_df(f"{sql_query_chromosomes}")["count"][0]:
6119            log.info("VCF empty")
6120            return None
6121
6122        # Export in VCF
6123        log.debug("Create initial file to annotate")
6124
6125        # Create output folder
6126        output_folder = os.path.join(self.get_tmp_dir(), f"splice-{get_random()}")
6127        if not os.path.exists(output_folder):
6128            Path(output_folder).mkdir(parents=True, exist_ok=True)
6129
6130        # Create tmp VCF file
6131        tmp_vcf = NamedTemporaryFile(
6132            prefix=self.get_prefix(),
6133            dir=output_folder,
6134            suffix=".vcf",
6135            delete=False,
6136        )
6137        tmp_vcf_name = tmp_vcf.name
6138
6139        # VCF header
6140        header = self.get_header()
6141
6142        # Existing annotations
6143        for vcf_annotation in self.get_header().infos:
6144
6145            vcf_annotation_line = self.get_header().infos.get(vcf_annotation)
6146            log.debug(
6147                f"Existing annotations in VCF: {vcf_annotation} [{vcf_annotation_line}]"
6148            )
6149
6150        # Memory limit
6151        if config.get("memory", None):
6152            memory_limit = config.get("memory", "8G").upper()
6153            # upper()
6154        else:
6155            memory_limit = "8G"
6156        log.debug(f"memory_limit: {memory_limit}")
6157
6158        # Check number of variants to annotate
6159        where_clause_regex_spliceai = r"SpliceAI_\w+"
6160        where_clause_regex_spip = r"SPiP_\w+"
6161        where_clause = f""" WHERE NOT regexp_matches("INFO", '{where_clause_regex_spliceai}') AND NOT regexp_matches("INFO", '{where_clause_regex_spip}')"""
6162        df_list_of_variants_to_annotate = self.get_query_to_df(
6163            query=f""" SELECT * FROM variants {where_clause} """
6164        )
6165        if len(df_list_of_variants_to_annotate) == 0:
6166            log.warning(
6167                f"No variants to annotate with splice. Variants probably already annotated with splice"
6168            )
6169            return None
6170        else:
6171            log.info(f"Annotation: {len(df_list_of_variants_to_annotate)} variants")
6172
6173        # Export VCF file
6174        self.export_variant_vcf(
6175            vcf_file=tmp_vcf_name,
6176            remove_info=True,
6177            add_samples=True,
6178            index=False,
6179            where_clause=where_clause,
6180        )
6181
6182        # Create docker container and launch splice analysis
6183        if splice_config:
6184
6185            # Splice mount folders
6186            mount_folders = splice_config.get("mount", {})
6187
6188            # Genome mount
6189            mount_folders[
6190                config.get("folders", {})
6191                .get("databases", {})
6192                .get("genomes", DEFAULT_GENOME_FOLDER)
6193            ] = "ro"
6194
6195            # SpliceAI mount
6196            mount_folders[
6197                config.get("folders", {})
6198                .get("databases", {})
6199                .get("spliceai", DEFAULT_SPLICEAI_FOLDER)
6200            ] = "ro"
6201
6202            # Genome mount
6203            mount_folders[
6204                config.get("folders", {})
6205                .get("databases", {})
6206                .get("spip", DEFAULT_SPIP_FOLDER)
6207            ] = "ro"
6208
6209            # Mount folders
6210            mount = []
6211
6212            # Config mount
6213            mount = [
6214                f"-v {full_path(path)}:{full_path(path)}:{mode}"
6215                for path, mode in mount_folders.items()
6216            ]
6217
6218            if any(value for value in splice_config.values() if value is None):
6219                log.warning("At least one splice config parameter is empty")
6220                return None
6221
6222            # Params in splice nf
6223            def check_values(dico: dict):
6224                """
6225                Ensure parameters for NF splice pipeline
6226                """
6227                for key, val in dico.items():
6228                    if key == "genome":
6229                        if any(
6230                            assemb in options.get("genome", {})
6231                            for assemb in ["hg19", "GRCh37", "grch37", "GRCH37"]
6232                        ):
6233                            yield f"--{key} hg19"
6234                        elif any(
6235                            assemb in options.get("genome", {})
6236                            for assemb in ["hg38", "GRCh38", "grch38", "GRCH38"]
6237                        ):
6238                            yield f"--{key} hg38"
6239                    elif (
6240                        (isinstance(val, str) and val)
6241                        or isinstance(val, int)
6242                        or isinstance(val, bool)
6243                    ):
6244                        yield f"--{key} {val}"
6245
6246            # Genome
6247            genome = options.get("genome", config.get("assembly", DEFAULT_ASSEMBLY))
6248            options["genome"] = genome
6249
6250            # NF params
6251            nf_params = []
6252
6253            # Add options
6254            if options:
6255                nf_params = list(check_values(options))
6256                log.debug(f"Splice NF params: {' '.join(nf_params)}")
6257            else:
6258                log.debug("No NF params provided")
6259
6260            # Add threads
6261            if "threads" not in options.keys():
6262                nf_params.append(f"--threads {threads}")
6263
6264            # Genome path
6265            genome_path = find_genome(
6266                config.get("folders", {})
6267                .get("databases", {})
6268                .get("genomes", DEFAULT_GENOME_FOLDER),
6269                file=f"{genome}.fa",
6270            )
6271            # Add genome path
6272            if not genome_path:
6273                raise ValueError(
6274                    f"Can't find genome assembly {genome}.fa in {config.get('folders', {}).get('databases', {}).get('genomes', DEFAULT_GENOME_FOLDER)}"
6275                )
6276            else:
6277                log.debug(f"Genome: {genome_path}")
6278                nf_params.append(f"--genome_path {genome_path}")
6279
6280            def splice_annotations(options: dict = {}, config: dict = {}) -> list:
6281                """
6282                Setting up updated databases for SPiP and SpliceAI
6283                """
6284
6285                try:
6286
6287                    # SpliceAI assembly transcriptome
6288                    spliceai_assembly = os.path.join(
6289                        config.get("folders", {})
6290                        .get("databases", {})
6291                        .get("spliceai", {}),
6292                        options.get("genome"),
6293                        "transcriptome",
6294                    )
6295                    spip_assembly = options.get("genome")
6296
6297                    spip = find(
6298                        f"transcriptome_{spip_assembly}.RData",
6299                        config.get("folders", {}).get("databases", {}).get("spip", {}),
6300                    )
6301                    spliceai = find("spliceai.refseq.txt", spliceai_assembly)
6302                    log.debug(f"SPiP annotations: {spip}")
6303                    log.debug(f"SpliceAI annotations: {spliceai}")
6304                    if spip and spliceai:
6305                        return [
6306                            f"--spip_transcriptome {spip}",
6307                            f"--spliceai_annotations {spliceai}",
6308                        ]
6309                    else:
6310                        # TODO crash and go on with basic annotations ?
6311                        # raise ValueError(
6312                        #     "Can't find splice databases in configuration EXIT"
6313                        # )
6314                        log.warning(
6315                            "Can't find splice databases in configuration, use annotations file from image"
6316                        )
6317                except TypeError:
6318                    log.warning(
6319                        "Can't find splice databases in configuration, use annotations file from image"
6320                    )
6321                    return []
6322
6323            # Add options, check if transcriptome option have already beend provided
6324            if (
6325                "spip_transcriptome" not in nf_params
6326                and "spliceai_transcriptome" not in nf_params
6327            ):
6328                splice_reference = splice_annotations(options, config)
6329                if splice_reference:
6330                    nf_params.extend(splice_reference)
6331
6332            nf_params.append(f"--output_folder {output_folder}")
6333
6334            random_uuid = f"HOWARD-SPLICE-{get_random()}"
6335            cmd = f"nextflow -log {os.path.join(output_folder, f'{random_uuid}.log')} -c /app/SpliceToolBox/src/splicetoolbox/nextflow/nextflow.docker.config run /app/SpliceToolBox/src/splicetoolbox/nextflow/main.nf -entry SPLICE --vcf {tmp_vcf_name} {' '.join(nf_params)} -profile standard,conda,singularity,report,timeline"
6336            log.debug(cmd)
6337
6338            splice_config["docker"]["command"] = cmd
6339
6340            docker_cmd = get_bin_command(
6341                tool="splice",
6342                bin_type="docker",
6343                config=config,
6344                default_folder=f"{DEFAULT_TOOLS_FOLDER}/docker",
6345                add_options=f"--name {random_uuid} {' '.join(mount)}",
6346            )
6347
6348            # Docker debug
6349            # if splice_config.get("rm_container"):
6350            #     rm_container = "--rm"
6351            # else:
6352            #     rm_container = ""
6353            # docker_cmd = f"docker run {rm_container} --entrypoint '/bin/bash' --name {random_uuid} {' '.join(mount)} {':'.join(splice_config.get('image'))} {cmd}"
6354
6355            log.debug(docker_cmd)
6356            res = subprocess.run(docker_cmd, shell=True, capture_output=True, text=True)
6357            log.debug(res.stdout)
6358            if res.stderr:
6359                log.error(res.stderr)
6360            res.check_returncode()
6361        else:
6362            log.warning(f"Splice tool configuration not found: {config}")
6363
6364        # Update variants
6365        log.info("Annotation - Updating...")
6366        # Test find output vcf
6367        log.debug(
6368            f"TMP splice output: {os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6369        )
6370        output_vcf = []
6371        # Wrong folder to look in
6372        for files in os.listdir(os.path.dirname(tmp_vcf_name)):
6373            if (
6374                files
6375                == f"{os.path.basename(tmp_vcf_name).replace('.vcf', '')}.spip.spliceai.sorted.vcf.gz"
6376            ):
6377                output_vcf.append(os.path.join(os.path.dirname(tmp_vcf_name), files))
6378        # log.debug(os.listdir(options.get("output_folder")))
6379        log.debug(f"Splice annotated vcf: {output_vcf[0]}")
6380        if not output_vcf:
6381            log.debug(
6382                f"Splice output was not generated {os.path.basename(tmp_vcf_name)}*.spip.spliceai.sorted.vcf.gz"
6383            )
6384        else:
6385            # Get new header from annotated vcf
6386            log.debug(f"Initial header: {len(header.infos)} fields")
6387            # Create new header with splice infos
6388            new_vcf = Variants(input=output_vcf[0])
6389            new_vcf_header = new_vcf.get_header().infos
6390            for keys, infos in new_vcf_header.items():
6391                if keys not in header.infos.keys():
6392                    header.infos[keys] = infos
6393            log.debug(f"New header: {len(header.infos)} fields")
6394            log.debug(f"Splice tmp output: {output_vcf[0]}")
6395            self.update_from_vcf(output_vcf[0])
6396
6397        # Remove folder
6398        remove_if_exists(output_folder)

This function annotates the VCF with splice tools (SPiP and SpliceAI).

Parameters
  • threads: The number of threads to use
Returns

None; splice annotations are merged into the variants table.

def get_config_default(self, name: str) -> dict:
6404    def get_config_default(self, name: str) -> dict:
6405        """
6406        The function `get_config_default` returns a dictionary containing default configurations for
6407        various calculations and prioritizations.
6408
6409        :param name: The `get_config_default` function returns a dictionary containing default
6410        configurations for different calculations and prioritizations. The `name` parameter is used to
6411        specify which specific configuration to retrieve from the dictionary
6412        :type name: str
6413        :return: The function `get_config_default` returns a dictionary containing default configuration
6414        settings for different calculations and prioritizations. The specific configuration settings are
6415        retrieved based on the input `name` parameter provided to the function. If the `name` parameter
6416        matches a key in the `config_default` dictionary, the corresponding configuration settings are
6417        returned. If there is no match, an empty dictionary is returned.
6418        """
6419
6420        config_default = {
6421            "calculations": {
6422                "variant_chr_pos_alt_ref": {
6423                    "type": "sql",
6424                    "name": "variant_chr_pos_alt_ref",
6425                    "description": "Create a variant ID with chromosome, position, alt and ref",
6426                    "available": False,
6427                    "output_column_name": "variant_chr_pos_alt_ref",
6428                    "output_column_type": "String",
6429                    "output_column_description": "variant ID with chromosome, position, alt and ref",
6430                    "operation_query": """ concat("#CHROM", '_', "POS", '_', "REF", '_', "ALT") """,
6431                    "operation_info": True,
6432                },
6433                "VARTYPE": {
6434                    "type": "sql",
6435                    "name": "VARTYPE",
6436                    "description": "Variant type (e.g. SNV, INDEL, MNV, BND...)",
6437                    "available": True,
6438                    "output_column_name": "VARTYPE",
6439                    "output_column_type": "String",
6440                    "output_column_description": "Variant type: SNV if X>Y, MOSAIC if X>Y,Z or X,Y>Z, INDEL if XY>Z or X>YZ",
6441                    "operation_query": """
6442                            CASE
6443                                WHEN "SVTYPE" NOT NULL THEN "SVTYPE"
6444                                WHEN LENGTH(REF) = 1 AND LENGTH(ALT) = 1 THEN 'SNV'
6445                                WHEN REF LIKE '%,%' OR ALT LIKE '%,%' THEN 'MOSAIC'
6446                                WHEN LENGTH(REF) == LENGTH(ALT) AND LENGTH(REF) > 1 THEN 'MNV'
6447                                WHEN LENGTH(REF) <> LENGTH(ALT) THEN 'INDEL'
6448                                ELSE 'UNDEFINED'
6449                            END
6450                            """,
6451                    "info_fields": ["SVTYPE"],
6452                    "operation_info": True,
6453                },
6454                "snpeff_hgvs": {
6455                    "type": "python",
6456                    "name": "snpeff_hgvs",
6457                    "description": "HGVS nomenclatures from snpEff annotation",
6458                    "available": True,
6459                    "function_name": "calculation_extract_snpeff_hgvs",
6460                    "function_params": ["snpeff_hgvs", "ANN"],
6461                },
6462                "snpeff_ann_explode": {
6463                    "type": "python",
6464                    "name": "snpeff_ann_explode",
6465                    "description": "Explode snpEff annotations with uniquify values",
6466                    "available": True,
6467                    "function_name": "calculation_snpeff_ann_explode",
6468                    "function_params": [False, "fields", "snpeff_", "ANN"],
6469                },
6470                "snpeff_ann_explode_uniquify": {
6471                    "type": "python",
6472                    "name": "snpeff_ann_explode_uniquify",
6473                    "description": "Explode snpEff annotations",
6474                    "available": True,
6475                    "function_name": "calculation_snpeff_ann_explode",
6476                    "function_params": [True, "fields", "snpeff_uniquify_", "ANN"],
6477                },
6478                "snpeff_ann_explode_json": {
6479                    "type": "python",
6480                    "name": "snpeff_ann_explode_json",
6481                    "description": "Explode snpEff annotations in JSON format",
6482                    "available": True,
6483                    "function_name": "calculation_snpeff_ann_explode",
6484                    "function_params": [False, "JSON", "snpeff_json", "ANN"],
6485                },
6486                "NOMEN": {
6487                    "type": "python",
6488                    "name": "NOMEN",
6489                    "description": "NOMEN information (e.g. NOMEN, CNOMEN, PNOMEN...) from HGVS nomenclature field",
6490                    "available": True,
6491                    "function_name": "calculation_extract_nomen",
6492                    "function_params": [],
6493                },
6494                "FINDBYPIPELINE": {
6495                    "type": "python",
6496                    "name": "FINDBYPIPELINE",
6497                    "description": "Number of pipeline that identify the variant (for multi pipeline VCF)",
6498                    "available": True,
6499                    "function_name": "calculation_find_by_pipeline",
6500                    "function_params": ["findbypipeline"],
6501                },
6502                "FINDBYSAMPLE": {
6503                    "type": "python",
6504                    "name": "FINDBYSAMPLE",
6505                    "description": "Number of sample that have a genotype for the variant (for multi sample VCF)",
6506                    "available": True,
6507                    "function_name": "calculation_find_by_pipeline",
6508                    "function_params": ["findbysample"],
6509                },
6510                "GENOTYPECONCORDANCE": {
6511                    "type": "python",
6512                    "name": "GENOTYPECONCORDANCE",
6513                    "description": "Concordance of genotype for multi caller VCF",
6514                    "available": True,
6515                    "function_name": "calculation_genotype_concordance",
6516                    "function_params": [],
6517                },
6518                "BARCODE": {
6519                    "type": "python",
6520                    "name": "BARCODE",
6521                    "description": "BARCODE as VaRank tool",
6522                    "available": True,
6523                    "function_name": "calculation_barcode",
6524                    "function_params": [],
6525                },
6526                "BARCODEFAMILY": {
6527                    "type": "python",
6528                    "name": "BARCODEFAMILY",
6529                    "description": "BARCODEFAMILY as VaRank tool",
6530                    "available": True,
6531                    "function_name": "calculation_barcode_family",
6532                    "function_params": ["BCF"],
6533                },
6534                "TRIO": {
6535                    "type": "python",
6536                    "name": "TRIO",
6537                    "description": "Inheritance for a trio family",
6538                    "available": True,
6539                    "function_name": "calculation_trio",
6540                    "function_params": [],
6541                },
6542                "VAF": {
6543                    "type": "python",
6544                    "name": "VAF",
6545                    "description": "Variant Allele Frequency (VAF) harmonization",
6546                    "available": True,
6547                    "function_name": "calculation_vaf_normalization",
6548                    "function_params": [],
6549                },
6550                "VAF_stats": {
6551                    "type": "python",
6552                    "name": "VAF_stats",
6553                    "description": "Variant Allele Frequency (VAF) statistics",
6554                    "available": True,
6555                    "function_name": "calculation_genotype_stats",
6556                    "function_params": ["VAF"],
6557                },
6558                "DP_stats": {
6559                    "type": "python",
6560                    "name": "DP_stats",
6561                    "description": "Depth (DP) statistics",
6562                    "available": True,
6563                    "function_name": "calculation_genotype_stats",
6564                    "function_params": ["DP"],
6565                },
6566                "variant_id": {
6567                    "type": "python",
6568                    "name": "variant_id",
6569                    "description": "Variant ID generated from variant position and type",
6570                    "available": True,
6571                    "function_name": "calculation_variant_id",
6572                    "function_params": [],
6573                },
6574                "transcripts_json": {
6575                    "type": "python",
6576                    "name": "transcripts_json",
6577                    "description": "Add transcripts annotations in JSON format (field 'transcripts_json')",
6578                    "available": True,
6579                    "function_name": "calculation_transcripts_annotation",
6580                    "function_params": ["transcripts_json", None],
6581                },
6582                "transcripts_ann": {
6583                    "type": "python",
6584                    "name": "transcripts_ann",
6585                    "description": "Add transcripts annotations in structured format (field 'transcripts_ann')",
6586                    "available": True,
6587                    "function_name": "calculation_transcripts_annotation",
6588                    "function_params": [None, "transcripts_ann"],
6589                },
6590                "transcripts_annotations": {
6591                    "type": "python",
6592                    "name": "transcripts_annotations",
6593                    "description": "Add transcripts annotations in JSON and/or structured format (see param JSON file)",
6594                    "available": True,
6595                    "function_name": "calculation_transcripts_annotation",
6596                    "function_params": [None, None],
6597                },
6598                "transcripts_prioritization": {
6599                    "type": "python",
6600                    "name": "transcripts_prioritization",
6601                    "description": "Prioritize transcripts with a prioritization profile (using param.json)",
6602                    "available": True,
6603                    "function_name": "calculation_transcripts_prioritization",
6604                    "function_params": [],
6605                },
6606            },
6607            "prioritizations": {
6608                "default": {
6609                    "ANN2": [
6610                        {
6611                            "type": "contains",
6612                            "value": "HIGH",
6613                            "score": 5,
6614                            "flag": "PASS",
6615                            "comment": [
6616                                "The variant is assumed to have high (disruptive) impact in the protein, probably causing protein truncation, loss of function or triggering nonsense mediated decay"
6617                            ],
6618                        },
6619                        {
6620                            "type": "contains",
6621                            "value": "MODERATE",
6622                            "score": 3,
6623                            "flag": "PASS",
6624                            "comment": [
6625                                "A non-disruptive variant that might change protein effectiveness"
6626                            ],
6627                        },
6628                        {
6629                            "type": "contains",
6630                            "value": "LOW",
6631                            "score": 0,
6632                            "flag": "FILTERED",
6633                            "comment": [
6634                                "Assumed to be mostly harmless or unlikely to change protein behavior"
6635                            ],
6636                        },
6637                        {
6638                            "type": "contains",
6639                            "value": "MODIFIER",
6640                            "score": 0,
6641                            "flag": "FILTERED",
6642                            "comment": [
6643                                "Usually non-coding variants or variants affecting non-coding genes, where predictions are difficult or there is no evidence of impact"
6644                            ],
6645                        },
6646                    ],
6647                }
6648            },
6649        }
6650
6651        return config_default.get(name, None)

The function get_config_default returns a dictionary containing default configurations for various calculations and prioritizations.

Parameters
  • name: The get_config_default function returns a dictionary containing default configurations for different calculations and prioritizations. The name parameter is used to specify which specific configuration to retrieve from the dictionary
Returns

The function get_config_default returns a dictionary containing default configuration settings for different calculations and prioritizations. The specific configuration settings are retrieved based on the input name parameter provided to the function. If the name parameter matches a key in the config_default dictionary, the corresponding configuration settings are returned. If there is no match, an empty dictionary is returned.

def get_config_json(self, name: str, config_dict: dict = {}, config_file: str = None) -> dict:
6653    def get_config_json(
6654        self, name: str, config_dict: dict = {}, config_file: str = None
6655    ) -> dict:
6656        """
6657        The function `get_config_json` retrieves a configuration JSON object with prioritizations from
6658        default values, a dictionary, and a file.
6659
6660        :param name: The `name` parameter in the `get_config_json` function is a string that represents
6661        the name of the configuration. It is used to identify and retrieve the configuration settings
6662        for a specific component or module
6663        :type name: str
6664        :param config_dict: The `config_dict` parameter in the `get_config_json` function is a
6665        dictionary that allows you to provide additional configuration settings or overrides. When you
6666        call the `get_config_json` function, you can pass a dictionary containing key-value pairs where
6667        the key is the configuration setting you want to override or
6668        :type config_dict: dict
6669        :param config_file: The `config_file` parameter in the `get_config_json` function is used to
6670        specify the path to a configuration file that contains additional settings. If provided, the
6671        function will read the contents of this file and update the configuration dictionary with the
6672        values found in the file, overriding any existing values with the
6673        :type config_file: str
6674        :return: The function `get_config_json` returns a dictionary containing the configuration
6675        settings.
6676        """
6677
6678        # Create with default prioritizations
6679        config_default = self.get_config_default(name=name)
6680        configuration = config_default
6681        # log.debug(f"configuration={configuration}")
6682
6683        # Replace prioritizations from dict
6684        for config in config_dict:
6685            configuration[config] = config_dict[config]
6686
6687        # Replace prioritizations from file
6688        config_file = full_path(config_file)
6689        if config_file:
6690            if os.path.exists(config_file):
6691                with open(config_file) as config_file_content:
6692                    config_file_dict = json.load(config_file_content)
6693                for config in config_file_dict:
6694                    configuration[config] = config_file_dict[config]
6695            else:
6696                msg_error = f"Config '{name}' file '{config_file}' does NOT exist"
6697                log.error(msg_error)
6698                raise ValueError(msg_error)
6699
6700        return configuration

The function get_config_json retrieves a configuration JSON object with prioritizations from default values, a dictionary, and a file.

Parameters
  • name: The name parameter in the get_config_json function is a string that represents the name of the configuration. It is used to identify and retrieve the configuration settings for a specific component or module
  • config_dict: The config_dict parameter in the get_config_json function is a dictionary that allows you to provide additional configuration settings or overrides. When you call the get_config_json function, you can pass a dictionary containing key-value pairs where the key is the configuration setting you want to override or
  • config_file: The config_file parameter in the get_config_json function is used to specify the path to a configuration file that contains additional settings. If provided, the function will read the contents of this file and update the configuration dictionary with the values found in the file, overriding any existing values with the
Returns

The function get_config_json returns a dictionary containing the configuration settings.

def prioritization(self, table: str = None, pz_prefix: str = None, pz_param: dict = None) -> bool:
6702    def prioritization(
6703        self, table: str = None, pz_prefix: str = None, pz_param: dict = None
6704    ) -> bool:
6705        """
6706        The `prioritization` function in Python processes VCF files, adds new INFO fields, and
6707        prioritizes variants based on configured profiles and criteria.
6708
6709        :param table: The `table` parameter in the `prioritization` function is used to specify the name
6710        of the table (presumably a VCF file) on which the prioritization operation will be performed. If
6711        a table name is provided, the method will prioritize the variants in that specific table
6712        :type table: str
6713        :param pz_prefix: The `pz_prefix` parameter is used to specify a prefix that will be added to
6714        certain INFO fields in a VCF file during the prioritization process. If this parameter is not
6715        provided, the code will use a default prefix value of "PZ"
6716        :type pz_prefix: str
6717        :param pz_param: The `pz_param` parameter in the `prioritization` method is used to pass
6718        additional parameters specific to the prioritization process. These parameters can include
6719        settings related to prioritization profiles, fields, scoring modes, flags, comments, and other
6720        configurations needed for the prioritization of variants in a V
6721        :type pz_param: dict
6722        :return: A boolean value (True) is being returned from the `prioritization` function.
6723        """
6724
6725        # Config
6726        config = self.get_config()
6727
6728        # Param
6729        param = self.get_param()
6730
6731        # Prioritization param
6732        if pz_param is not None:
6733            prioritization_param = pz_param
6734        else:
6735            prioritization_param = param.get("prioritization", {})
6736
6737        # Configuration profiles
6738        prioritization_config_file = prioritization_param.get(
6739            "prioritization_config", None
6740        )
6741        prioritization_config_file = full_path(prioritization_config_file)
6742        prioritizations_config = self.get_config_json(
6743            name="prioritizations", config_file=prioritization_config_file
6744        )
6745
6746        # Prioritization prefix
6747        pz_prefix_default = "PZ"
6748        if pz_prefix is None:
6749            pz_prefix = prioritization_param.get("pzprefix", pz_prefix_default)
6750
6751        # Prioritization options
6752        profiles = prioritization_param.get("profiles", [])
6753        if isinstance(profiles, str):
6754            profiles = profiles.split(",")
6755        pzfields = prioritization_param.get(
6756            "pzfields", [f"{pz_prefix}Flag", f"{pz_prefix}Score"]
6757        )
6758        if isinstance(pzfields, str):
6759            pzfields = pzfields.split(",")
6760        default_profile = prioritization_param.get("default_profile", None)
6761        pzfields_sep = prioritization_param.get("pzfields_sep", "_")
6762        prioritization_score_mode = prioritization_param.get(
6763            "prioritization_score_mode", "HOWARD"
6764        )
6765
6766        # Quick Prioritizations
6767        prioritizations = param.get("prioritizations", None)
6768        if prioritizations:
6769            log.info("Quick Prioritization:")
6770            for profile in prioritizations.split(","):
6771                if profile not in profiles:
6772                    profiles.append(profile)
6773                    log.info(f"   {profile}")
6774
6775        # If profile "ALL" provided, all profiles in the config profiles
6776        if "ALL" in profiles:
6777            profiles = list(prioritizations_config.keys())
6778
6779        for profile in profiles:
6780            if prioritizations_config.get(profile, None):
6781                log.debug(f"Profile '{profile}' configured")
6782            else:
6783                msg_error = f"Profile '{profile}' NOT configured"
6784                log.error(msg_error)
6785                raise ValueError(msg_error)
6786
6787        if profiles:
6788            log.info(f"Prioritization... ")
6789        else:
6790            log.debug(f"No profile defined")
6791            return False
6792
6793        if not default_profile and len(profiles):
6794            default_profile = profiles[0]
6795
6796        log.debug("Profiles availables: " + str(list(prioritizations_config.keys())))
6797        log.debug("Profiles to check: " + str(list(profiles)))
6798
6799        # Variables
6800        if table is not None:
6801            table_variants = table
6802        else:
6803            table_variants = self.get_table_variants(clause="update")
6804        log.debug(f"Table to prioritize: {table_variants}")
6805
6806        # Added columns
6807        added_columns = []
6808
6809        # Create list of PZfields
6810        # List of PZFields
6811        list_of_pzfields_original = pzfields + [
6812            pzfield + pzfields_sep + profile
6813            for pzfield in pzfields
6814            for profile in profiles
6815        ]
6816        list_of_pzfields = []
6817        log.debug(f"{list_of_pzfields_original}")
6818
6819        # Remove existing PZfields to use if exists
6820        for pzfield in list_of_pzfields_original:
6821            if self.get_header().infos.get(pzfield, None) is None:
6822                list_of_pzfields.append(pzfield)
6823                log.debug(f"VCF Input - Header - PZfield '{pzfield}' not in VCF")
6824            else:
6825                log.debug(f"VCF Input - Header - PZfield '{pzfield}' already in VCF")
6826
6827        if list_of_pzfields:
6828
6829            # Explode Infos prefix
6830            explode_infos_prefix = self.get_explode_infos_prefix()
6831
6832            # PZfields tags description
6833            PZfields_INFOS = {
6834                f"{pz_prefix}Tags": {
6835                    "ID": f"{pz_prefix}Tags",
6836                    "Number": ".",
6837                    "Type": "String",
6838                    "Description": "Variant tags based on annotation criteria",
6839                },
6840                f"{pz_prefix}Score": {
6841                    "ID": f"{pz_prefix}Score",
6842                    "Number": 1,
6843                    "Type": "Integer",
6844                    "Description": "Variant score based on annotation criteria",
6845                },
6846                f"{pz_prefix}Flag": {
6847                    "ID": f"{pz_prefix}Flag",
6848                    "Number": 1,
6849                    "Type": "String",
6850                    "Description": "Variant flag based on annotation criteria",
6851                },
6852                f"{pz_prefix}Comment": {
6853                    "ID": f"{pz_prefix}Comment",
6854                    "Number": ".",
6855                    "Type": "String",
6856                    "Description": "Variant comment based on annotation criteria",
6857                },
6858                f"{pz_prefix}Infos": {
6859                    "ID": f"{pz_prefix}Infos",
6860                    "Number": ".",
6861                    "Type": "String",
6862                    "Description": "Variant infos based on annotation criteria",
6863                },
6864                f"{pz_prefix}Class": {
6865                    "ID": f"{pz_prefix}Class",
6866                    "Number": ".",
6867                    "Type": "String",
6868                    "Description": "Variant class based on annotation criteria",
6869                },
6870            }
6871
6872            # Create INFO fields if not exist
6873            for field in PZfields_INFOS:
6874                field_ID = PZfields_INFOS[field]["ID"]
6875                field_description = PZfields_INFOS[field]["Description"]
6876                if field_ID not in self.get_header().infos and field_ID in pzfields:
6877                    field_description = (
6878                        PZfields_INFOS[field]["Description"]
6879                        + f", profile {default_profile}"
6880                    )
6881                    self.get_header().infos[field_ID] = vcf.parser._Info(
6882                        field_ID,
6883                        PZfields_INFOS[field]["Number"],
6884                        PZfields_INFOS[field]["Type"],
6885                        field_description,
6886                        "unknown",
6887                        "unknown",
6888                        code_type_map[PZfields_INFOS[field]["Type"]],
6889                    )
6890
6891            # Create INFO fields if not exist for each profile
6892            for profile in prioritizations_config:
6893                if profile in profiles or profiles == []:
6894                    for field in PZfields_INFOS:
6895                        field_ID = PZfields_INFOS[field]["ID"] + pzfields_sep + profile
6896                        field_description = (
6897                            PZfields_INFOS[field]["Description"]
6898                            + f", profile {profile}"
6899                        )
6900                        if (
6901                            field_ID not in self.get_header().infos
6902                            and field in pzfields
6903                        ):
6904                            self.get_header().infos[field_ID] = vcf.parser._Info(
6905                                field_ID,
6906                                PZfields_INFOS[field]["Number"],
6907                                PZfields_INFOS[field]["Type"],
6908                                field_description,
6909                                "unknown",
6910                                "unknown",
6911                                code_type_map[PZfields_INFOS[field]["Type"]],
6912                            )
6913
6914            # Header
6915            for pzfield in list_of_pzfields:
6916                if re.match(f"{pz_prefix}Score.*", pzfield):
6917                    added_column = self.add_column(
6918                        table_name=table_variants,
6919                        column_name=pzfield,
6920                        column_type="INTEGER",
6921                        default_value="0",
6922                    )
6923                elif re.match(f"{pz_prefix}Flag.*", pzfield):
6924                    added_column = self.add_column(
6925                        table_name=table_variants,
6926                        column_name=pzfield,
6927                        column_type="BOOLEAN",
6928                        default_value="1",
6929                    )
6930                elif re.match(f"{pz_prefix}Class.*", pzfield):
6931                    added_column = self.add_column(
6932                        table_name=table_variants,
6933                        column_name=pzfield,
6934                        column_type="VARCHAR[]",
6935                        default_value="null",
6936                    )
6937                else:
6938                    added_column = self.add_column(
6939                        table_name=table_variants,
6940                        column_name=pzfield,
6941                        column_type="STRING",
6942                        default_value="''",
6943                    )
6944                added_columns.append(added_column)
6945
6946            # Profiles
6947            if profiles:
6948
6949                # foreach profile in configuration file
6950                for profile in prioritizations_config:
6951
6952                    # If profile is asked in param, or ALL are asked (empty profile [])
6953                    if profile in profiles or profiles == []:
6954                        log.info(f"Profile '{profile}'")
6955
6956                        sql_set_info_option = ""
6957
6958                        sql_set_info = []
6959
6960                        # PZ fields set
6961
6962                        # PZScore
6963                        if (
6964                            f"{pz_prefix}Score{pzfields_sep}{profile}"
6965                            in list_of_pzfields
6966                        ):
6967                            sql_set_info.append(
6968                                f"""
6969                                    concat(
6970                                        '{pz_prefix}Score{pzfields_sep}{profile}=',
6971                                        {pz_prefix}Score{pzfields_sep}{profile}
6972                                    ) 
6973                                """
6974                            )
6975                            if (
6976                                profile == default_profile
6977                                and f"{pz_prefix}Score" in list_of_pzfields
6978                            ):
6979                                sql_set_info.append(
6980                                    f"""
6981                                        concat(
6982                                            '{pz_prefix}Score=',
6983                                            {pz_prefix}Score{pzfields_sep}{profile}
6984                                        )
6985                                    """
6986                                )
6987
6988                        # PZFlag
6989                        if (
6990                            f"{pz_prefix}Flag{pzfields_sep}{profile}"
6991                            in list_of_pzfields
6992                        ):
6993                            sql_set_info.append(
6994                                f"""
6995                                    concat(
6996                                        '{pz_prefix}Flag{pzfields_sep}{profile}=',
6997                                        CASE 
6998                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
6999                                            THEN 'PASS'
7000                                            WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7001                                            THEN 'FILTERED'
7002                                        END
7003                                    ) 
7004                                """
7005                            )
7006                            if (
7007                                profile == default_profile
7008                                and f"{pz_prefix}Flag" in list_of_pzfields
7009                            ):
7010                                sql_set_info.append(
7011                                    f"""
7012                                        concat(
7013                                            '{pz_prefix}Flag=',
7014                                            CASE 
7015                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==1
7016                                                THEN 'PASS'
7017                                                WHEN {pz_prefix}Flag{pzfields_sep}{profile}==0
7018                                                THEN 'FILTERED'
7019                                            END
7020                                        )
7021                                    """
7022                                )
7023
7024                        # PZClass
7025                        if (
7026                            f"{pz_prefix}Class{pzfields_sep}{profile}"
7027                            in list_of_pzfields
7028                        ):
7029                            sql_set_info.append(
7030                                f"""
7031                                    concat(
7032                                        '{pz_prefix}Class{pzfields_sep}{profile}=',
7033                                        CASE
7034                                            WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7035                                            THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7036                                            ELSE '.'
7037                                        END 
7038                                    )
7039                                    
7040                                """
7041                            )
7042                            if (
7043                                profile == default_profile
7044                                and f"{pz_prefix}Class" in list_of_pzfields
7045                            ):
7046                                sql_set_info.append(
7047                                    f"""
7048                                        concat(
7049                                            '{pz_prefix}Class=',
7050                                            CASE
7051                                                WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7052                                                THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7053                                                ELSE '.'
7054                                            END 
7055                                        )
7056                                    """
7057                                )
7058
7059                        # PZComment
7060                        if (
7061                            f"{pz_prefix}Comment{pzfields_sep}{profile}"
7062                            in list_of_pzfields
7063                        ):
7064                            sql_set_info.append(
7065                                f"""
7066                                    CASE
7067                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7068                                        THEN concat('{pz_prefix}Comment{pzfields_sep}{profile}=', {pz_prefix}Comment{pzfields_sep}{profile})
7069                                        ELSE ''
7070                                    END
7071                                """
7072                            )
7073                            if (
7074                                profile == default_profile
7075                                and f"{pz_prefix}Comment" in list_of_pzfields
7076                            ):
7077                                sql_set_info.append(
7078                                    f"""
7079                                        CASE
7080                                            WHEN {pz_prefix}Comment{pzfields_sep}{profile} NOT IN ('')
7081                                            THEN concat('{pz_prefix}Comment=', {pz_prefix}Comment{pzfields_sep}{profile})
7082                                            ELSE ''
7083                                        END
7084                                    """
7085                                )
7086
7087                        # PZInfos
7088                        if (
7089                            f"{pz_prefix}Infos{pzfields_sep}{profile}"
7090                            in list_of_pzfields
7091                        ):
7092                            sql_set_info.append(
7093                                f"""
7094                                    CASE
7095                                        WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7096                                        THEN concat('{pz_prefix}Infos{pzfields_sep}{profile}=', {pz_prefix}Infos{pzfields_sep}{profile})
7097                                        ELSE ''
7098                                    END
7099                                """
7100                            )
7101                            if (
7102                                profile == default_profile
7103                                and f"{pz_prefix}Infos" in list_of_pzfields
7104                            ):
7105                                sql_set_info.append(
7106                                    f"""
7107                                        CASE
7108                                            WHEN {pz_prefix}Infos{pzfields_sep}{profile} NOT IN ('')
7109                                            THEN concat('{pz_prefix}Infos=', {pz_prefix}Infos{pzfields_sep}{profile})
7110                                            ELSE ''
7111                                        END
7112                                    """
7113                                )
7114
7115                        # Merge PZfields
7116                        sql_set_info_option = ""
7117                        sql_set_sep = ""
7118                        for sql_set in sql_set_info:
7119                            if sql_set_sep:
7120                                sql_set_info_option += f"""
7121                                    , concat('{sql_set_sep}', {sql_set})
7122                                """
7123                            else:
7124                                sql_set_info_option += f"""
7125                                    , {sql_set}
7126                                """
7127                            sql_set_sep = ";"
7128
7129                        sql_queries = []
7130                        for annotation in prioritizations_config[profile]:
7131
7132                            # skip special sections
7133                            if annotation.startswith("_"):
7134                                continue
7135
7136                            # For each criterions
7137                            for criterion in prioritizations_config[profile][
7138                                annotation
7139                            ]:
7140
7141                                # Criterion mode
7142                                criterion_mode = None
7143                                if np.any(
7144                                    np.isin(list(criterion.keys()), ["type", "value"])
7145                                ):
7146                                    criterion_mode = "operation"
7147                                elif np.any(
7148                                    np.isin(list(criterion.keys()), ["sql", "fields"])
7149                                ):
7150                                    criterion_mode = "sql"
7151                                log.debug(f"Criterion Mode: {criterion_mode}")
7152
7153                                # Criterion parameters
7154                                criterion_type = criterion.get("type", None)
7155                                criterion_value = criterion.get("value", None)
7156                                criterion_sql = criterion.get("sql", None)
7157                                criterion_fields = criterion.get("fields", None)
7158                                criterion_score = criterion.get("score", 0)
7159                                criterion_flag = criterion.get("flag", "PASS")
7160                                criterion_class = criterion.get("class", None)
7161                                criterion_flag_bool = criterion_flag == "PASS"
7162                                criterion_comment = (
7163                                    ", ".join(criterion.get("comment", []))
7164                                    .replace("'", "''")
7165                                    .replace(";", ",")
7166                                    .replace("\t", " ")
7167                                )
7168                                criterion_infos = (
7169                                    str(criterion)
7170                                    .replace("'", "''")
7171                                    .replace(";", ",")
7172                                    .replace("\t", " ")
7173                                )
7174
7175                                # SQL
7176                                if criterion_sql is not None and isinstance(
7177                                    criterion_sql, list
7178                                ):
7179                                    criterion_sql = " ".join(criterion_sql)
7180
7181                                # Fields and explode
7182                                if criterion_fields is None:
7183                                    criterion_fields = [annotation]
7184                                if not isinstance(criterion_fields, list):
7185                                    criterion_fields = str(criterion_fields).split(",")
7186
7187                                # Class
7188                                if criterion_class is not None and not isinstance(
7189                                    criterion_class, list
7190                                ):
7191                                    criterion_class = str(criterion_class).split(",")
7192
7193                                for annotation_field in criterion_fields:
7194
7195                                    # Explode specific annotation
7196                                    log.debug(
7197                                        f"Explode annotation '{annotation_field}'"
7198                                    )
7199                                    added_columns += self.explode_infos(
7200                                        prefix=explode_infos_prefix,
7201                                        fields=[annotation_field],
7202                                        table=table_variants,
7203                                    )
7204                                    extra_infos = self.get_extra_infos(
7205                                        table=table_variants
7206                                    )
7207
7208                                    # Check if annotation field is present
7209                                    if (
7210                                        f"{explode_infos_prefix}{annotation_field}"
7211                                        not in extra_infos
7212                                    ):
7213                                        msq_err = f"Annotation '{annotation_field}' not in data"
7214                                        log.error(msq_err)
7215                                        raise ValueError(msq_err)
7216                                    else:
7217                                        log.debug(
7218                                            f"Annotation '{annotation_field}' in data"
7219                                        )
7220
7221                                sql_set = []
7222                                sql_set_info = []
7223
7224                                # PZ fields set
7225
7226                                # PZScore
7227                                if (
7228                                    f"{pz_prefix}Score{pzfields_sep}{profile}"
7229                                    in list_of_pzfields
7230                                ):
7231                                    # if prioritization_score_mode == "HOWARD":
7232                                    #     sql_set.append(
7233                                    #         f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7234                                    #     )
7235                                    # VaRank prioritization score mode
7236                                    if prioritization_score_mode == "VaRank":
7237                                        sql_set.append(
7238                                            f"{pz_prefix}Score{pzfields_sep}{profile} = CASE WHEN {criterion_score}>{pz_prefix}Score{pzfields_sep}{profile} THEN {criterion_score} END"
7239                                        )
7240                                    # default HOWARD prioritization score mode
7241                                    else:
7242                                        sql_set.append(
7243                                            f"{pz_prefix}Score{pzfields_sep}{profile} = {pz_prefix}Score{pzfields_sep}{profile} + {criterion_score}"
7244                                        )
7245
7246                                # PZFlag
7247                                if (
7248                                    f"{pz_prefix}Flag{pzfields_sep}{profile}"
7249                                    in list_of_pzfields
7250                                ):
7251                                    sql_set.append(
7252                                        f"{pz_prefix}Flag{pzfields_sep}{profile} = {pz_prefix}Flag{pzfields_sep}{profile} AND {criterion_flag_bool}"
7253                                    )
7254
7255                                # PZClass
7256                                if (
7257                                    f"{pz_prefix}Class{pzfields_sep}{profile}"
7258                                    in list_of_pzfields
7259                                    and criterion_class is not None
7260                                ):
7261                                    sql_set.append(
7262                                        f" {pz_prefix}Class{pzfields_sep}{profile} = list_concat(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), {criterion_class}) "
7263                                    )
7264
7265                                # PZComment
7266                                if (
7267                                    f"{pz_prefix}Comment{pzfields_sep}{profile}"
7268                                    in list_of_pzfields
7269                                ):
7270                                    sql_set.append(
7271                                        f"""
7272                                            {pz_prefix}Comment{pzfields_sep}{profile} = 
7273                                                concat(
7274                                                    {pz_prefix}Comment{pzfields_sep}{profile},
7275                                                    CASE 
7276                                                        WHEN {pz_prefix}Comment{pzfields_sep}{profile}!=''
7277                                                        THEN ', '
7278                                                        ELSE ''
7279                                                    END,
7280                                                    '{criterion_comment}'
7281                                                )
7282                                        """
7283                                    )
7284
7285                                # PZInfos
7286                                if (
7287                                    f"{pz_prefix}Infos{pzfields_sep}{profile}"
7288                                    in list_of_pzfields
7289                                ):
7290                                    sql_set.append(
7291                                        f"""
7292                                            {pz_prefix}Infos{pzfields_sep}{profile} = 
7293                                                concat(
7294                                                    {pz_prefix}Infos{pzfields_sep}{profile},
7295                                                    '{criterion_infos}'
7296                                                )
7297                                        """
7298                                    )
7299                                sql_set_option = ",".join(sql_set)
7300
7301                                # Criterion and comparison
7302                                if sql_set_option:
7303
7304                                    if criterion_mode in ["operation"]:
7305
7306                                        try:
7307                                            float(criterion_value)
7308                                            sql_update = f"""
7309                                                UPDATE {table_variants}
7310                                                SET {sql_set_option}
7311                                                WHERE CAST("{explode_infos_prefix}{annotation}" AS VARCHAR) NOT IN ('','.')
7312                                                AND CAST("{explode_infos_prefix}{annotation}" AS FLOAT){comparison_map[criterion_type]}{criterion_value}
7313                                            """
7314                                        except:
7315                                            contains_option = ""
7316                                            if criterion_type == "contains":
7317                                                contains_option = ".*"
7318                                            sql_update = f"""
7319                                                UPDATE {table_variants}
7320                                                SET {sql_set_option}
7321                                                WHERE "{explode_infos_prefix}{annotation}" SIMILAR TO '{contains_option}{criterion_value}{contains_option}'
7322                                            """
7323                                        sql_queries.append(sql_update)
7324
7325                                    elif criterion_mode in ["sql"]:
7326
7327                                        sql_update = f"""
7328                                            UPDATE {table_variants}
7329                                            SET {sql_set_option}
7330                                            WHERE {criterion_sql}
7331                                        """
7332                                        sql_queries.append(sql_update)
7333
7334                                    else:
7335                                        msg_err = f"Prioritization criterion mode failed (either 'operation' or 'sql')"
7336                                        log.error(msg_err)
7337                                        raise ValueError(msg_err)
7338
7339                                else:
7340                                    log.warning(
7341                                        f"NO SQL SET option for '{annotation}' - '{criterion}'"
7342                                    )
7343
7344                        # PZTags
7345                        if (
7346                            f"{pz_prefix}Tags{pzfields_sep}{profile}"
7347                            in list_of_pzfields
7348                        ):
7349
7350                            # Create PZFalgs value
7351                            pztags_value = ""
7352                            pztags_sep_default = ","
7353                            pztags_sep = ""
7354                            for pzfield in pzfields:
7355                                if pzfield not in [f"{pz_prefix}Tags"]:
7356                                    if (
7357                                        f"{pzfield}{pzfields_sep}{profile}"
7358                                        in list_of_pzfields
7359                                    ):
7360                                        if pzfield in [f"{pz_prefix}Flag"]:
7361                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7362                                                CASE WHEN {pz_prefix}Flag{pzfields_sep}{profile}
7363                                                    THEN 'PASS'
7364                                                    ELSE 'FILTERED'
7365                                                END, '"""
7366                                        elif pzfield in [f"{pz_prefix}Class"]:
7367                                            pztags_value += f"""{pztags_sep}{pzfield}#', 
7368                                                CASE WHEN len({pz_prefix}Class{pzfields_sep}{profile}) > 0
7369                                                    THEN list_aggregate(list_distinct({pz_prefix}Class{pzfields_sep}{profile}), 'string_agg', ',')
7370                                                    ELSE '.'
7371                                                END, '"""
7372                                        else:
7373                                            pztags_value += f"{pztags_sep}{pzfield}#', {pzfield}{pzfields_sep}{profile}, '"
7374                                        pztags_sep = pztags_sep_default
7375
7376                            # Add Query update for PZFlags
7377                            sql_update_pztags = f"""
7378                                UPDATE {table_variants}
7379                                SET INFO = concat(
7380                                        INFO,
7381                                        CASE WHEN INFO NOT in ('','.')
7382                                                THEN ';'
7383                                                ELSE ''
7384                                        END,
7385                                        '{pz_prefix}Tags{pzfields_sep}{profile}={pztags_value}'
7386                                    )
7387                                """
7388                            sql_queries.append(sql_update_pztags)
7389
7390                            # Add Query update for PZFlags for default
7391                            if profile == default_profile:
7392                                sql_update_pztags_default = f"""
7393                                UPDATE {table_variants}
7394                                SET INFO = concat(
7395                                        INFO,
7396                                        ';',
7397                                        '{pz_prefix}Tags={pztags_value}'
7398                                    )
7399                                """
7400                                sql_queries.append(sql_update_pztags_default)
7401
7402                        log.info(f"""Profile '{profile}' - Prioritization... """)
7403
7404                        if sql_queries:
7405
7406                            for sql_query in sql_queries:
7407                                log.debug(
7408                                    f"""Profile '{profile}' - Prioritization query: {sql_query}... """
7409                                )
7410                                self.conn.execute(sql_query)
7411
7412                        log.info(f"""Profile '{profile}' - Update... """)
7413                        sql_query_update = f"""
7414                            UPDATE {table_variants}
7415                            SET INFO =  
7416                                concat(
7417                                    CASE
7418                                        WHEN INFO NOT IN ('','.')
7419                                        THEN concat(INFO, ';')
7420                                        ELSE ''
7421                                    END
7422                                    {sql_set_info_option}
7423                                )
7424                        """
7425                        self.conn.execute(sql_query_update)
7426
7427        else:
7428
7429            log.warning(f"No profiles in parameters")
7430
7431        # Remove added columns
7432        for added_column in added_columns:
7433            self.drop_column(column=added_column)
7434
7435        # Explode INFOS fields into table fields
7436        if self.get_explode_infos():
7437            self.explode_infos(
7438                prefix=self.get_explode_infos_prefix(),
7439                fields=self.get_explode_infos_fields(),
7440                force=True,
7441            )
7442
7443        return True

The prioritization function in Python processes VCF files, adds new INFO fields, and prioritizes variants based on configured profiles and criteria.

Parameters
  • table: The table parameter in the prioritization function is used to specify the name of the table (presumably a VCF file) on which the prioritization operation will be performed. If a table name is provided, the method will prioritize the variants in that specific table
  • pz_prefix: The pz_prefix parameter is used to specify a prefix that will be added to certain INFO fields in a VCF file during the prioritization process. If this parameter is not provided, the code will use a default prefix value of "PZ"
  • pz_param: The pz_param parameter in the prioritization method is used to pass additional parameters specific to the prioritization process. These parameters can include settings related to prioritization profiles, fields, scoring modes, flags, comments, and other configurations needed for the prioritization of variants in a VCF file
Returns

A boolean value (True) is being returned from the prioritization function.

def annotation_hgvs(self, threads: int = None) -> None:
    def annotation_hgvs(self, threads: int = None) -> None:
        """
        The `annotation_hgvs` function performs HGVS annotation on a set of variants using genomic
        coordinates and alleles.

        Pipeline:
          1. Read HGVS options from param/config (quick "hgvs_options" string or "hgvs" dict);
             return immediately if HGVS annotation is not enabled.
          2. Locate the genome FASTA, refSeq and refSeqLink database files.
          3. Select SNV/InDel variants and load the refSeq transcripts overlapping them.
          4. Compute HGVS names per variant in parallel (Dask partitions) into a temporary column.
          5. Fold the results into the INFO column, register the 'hgvs' VCF header field,
             and drop the temporary column.

        :param threads: The `threads` parameter is an optional integer that specifies the number of
        threads to use for parallel processing. If no value is provided, it will default to the number
        of threads obtained from the `get_threads()` method
        :type threads: int
        """

        # Function for each partition of the Dask Dataframe
        def partition_function(partition):
            """
            Apply `annotation_hgvs_partition` to each row of the given pandas
            DataFrame partition.

            :param partition: pandas DataFrame partition holding the variants to annotate
            :return: the result of applying `annotation_hgvs_partition` row-wise (axis=1)
            """
            return partition.apply(annotation_hgvs_partition, axis=1)

        def annotation_hgvs_partition(row) -> str:
            """
            Build the comma-separated list of HGVS names for one variant row.

            Closes over `polars_conn`, `transcripts`, `genome` and the HGVS option
            flags defined in the enclosing function.

            :param row: dict-like object with keys "CHROM", "POS", "REF" and "ALT"
            :return: HGVS names joined with ','
            """

            # NOTE(review): `chr` shadows the builtin within this function (kept as-is)
            chr = row["CHROM"]
            pos = row["POS"]
            ref = row["REF"]
            alt = row["ALT"]

            # Find list of associated transcripts
            # `refseq_df` is resolved by name through the Polars SQLContext
            transcripts_list = list(
                polars_conn.execute(
                    f"""
                SELECT transcript
                FROM refseq_df
                WHERE CHROM='{chr}'
                AND POS={pos}
            """
                )["transcript"]
            )

            # Full HGVS annotation in list
            hgvs_full_list = []

            for transcript_name in transcripts_list:

                # Transcript
                transcript = get_transcript(
                    transcripts=transcripts, transcript_name=transcript_name
                )
                # Exon
                if use_exon:
                    exon = transcript.find_exon_number(pos)
                else:
                    exon = None
                # Protein accession, looked up lazily only when a protein-level
                # name is requested (`refseqlink_df` is resolved by the SQLContext;
                # assumes refseqlink_file was found — TODO confirm upstream guard)
                transcript_protein = None
                if use_protein or add_protein or full_format:
                    transcripts_protein = list(
                        polars_conn.execute(
                            f"""
                        SELECT protein
                        FROM refseqlink_df
                        WHERE transcript='{transcript_name}'
                        LIMIT 1
                    """
                        )["protein"]
                    )
                    if len(transcripts_protein):
                        transcript_protein = transcripts_protein[0]

                # HGVS name
                hgvs_name = format_hgvs_name(
                    chr,
                    pos,
                    ref,
                    alt,
                    genome=genome,
                    transcript=transcript,
                    transcript_protein=transcript_protein,
                    exon=exon,
                    use_gene=use_gene,
                    use_protein=use_protein,
                    full_format=full_format,
                    use_version=use_version,
                    codon_type=codon_type,
                )
                hgvs_full_list.append(hgvs_name)
                # When only "add_protein" is requested, emit a second, protein-level
                # HGVS name alongside the transcript-level one
                if add_protein and not use_protein and not full_format:
                    hgvs_name = format_hgvs_name(
                        chr,
                        pos,
                        ref,
                        alt,
                        genome=genome,
                        transcript=transcript,
                        transcript_protein=transcript_protein,
                        exon=exon,
                        use_gene=use_gene,
                        use_protein=True,
                        full_format=False,
                        use_version=use_version,
                        codon_type=codon_type,
                    )
                    hgvs_full_list.append(hgvs_name)

            # Create liste of HGVS annotations
            hgvs_full = ",".join(hgvs_full_list)

            return hgvs_full

        # Polars connexion
        # NOTE(review): a second SQLContext is created later (just before the Dask
        # run); the nested functions use whichever `polars_conn` is bound at call time
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        # Config
        config = self.get_config()

        # Databases
        # Genome folder (with default fallback)
        databases_genomes_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("genomes", DEFAULT_GENOME_FOLDER)
        )
        # Same lookup without default — used to probe for an explicit genome first
        databases_genome = (
            config.get("folders", {}).get("databases", {}).get("genomes", "")
        )
        # refseq database folder
        databases_refseq_folders = (
            config.get("folders", {})
            .get("databases", {})
            .get("refseq", DEFAULT_REFSEQ_FOLDER)
        )
        # refseq
        databases_refseq = config.get("databases", {}).get("refSeq", None)
        # refSeqLink
        databases_refseqlink = config.get("databases", {}).get("refSeqLink", None)

        # Param
        param = self.get_param()

        # Quick HGVS: parse "hgvs_options" shortcut string ("opt1=val,opt2,...")
        # into the param["hgvs"] dict; bare options default to True
        if "hgvs_options" in param and param.get("hgvs_options", ""):
            log.info(f"Quick HGVS Annotation:")
            if not param.get("hgvs", None):
                param["hgvs"] = {}
            for option in param.get("hgvs_options", "").split(","):
                option_var_val = option.split("=")
                option_var = option_var_val[0]
                if len(option_var_val) > 1:
                    option_val = option_var_val[1]
                else:
                    option_val = "True"
                if option_val.upper() in ["TRUE"]:
                    option_val = True
                elif option_val.upper() in ["FALSE"]:
                    option_val = False
                log.info(f"   {option_var}={option_val}")
                param["hgvs"][option_var] = option_val

        # Check if HGVS annotation enabled; bail out silently otherwise
        if "hgvs" in param:
            log.info(f"HGVS Annotation... ")
            for hgvs_option in param.get("hgvs", {}):
                log.info(f"{hgvs_option}: {param.get('hgvs',{}).get(hgvs_option)}")
        else:
            return

        # HGVS Param
        param_hgvs = param.get("hgvs", {})
        use_exon = param_hgvs.get("use_exon", False)
        use_gene = param_hgvs.get("use_gene", False)
        use_protein = param_hgvs.get("use_protein", False)
        add_protein = param_hgvs.get("add_protein", False)
        full_format = param_hgvs.get("full_format", False)
        use_version = param_hgvs.get("use_version", False)
        codon_type = param_hgvs.get("codon_type", "3")

        # refSeq / refSeqLink overrides from HGVS param take precedence over config
        databases_refseq = param_hgvs.get("refseq", databases_refseq)
        databases_refseqlink = param_hgvs.get("refseqlink", databases_refseqlink)

        # Assembly
        assembly = param.get("assembly", config.get("assembly", DEFAULT_ASSEMBLY))

        # Genome: explicit genome path first, then folder+assembly lookup
        # NOTE(review): find_genome(databases_genome) is evaluated twice
        genome_file = None
        if find_genome(databases_genome):
            genome_file = find_genome(databases_genome)
        else:
            genome_file = find_genome(
                genome_path=databases_genomes_folders, assembly=assembly
            )
        log.debug("Genome: " + str(genome_file))

        # refSseq
        refseq_file = find_file_prefix(
            input_file=databases_refseq,
            prefix="ncbiRefSeq",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeq: " + str(refseq_file))

        # refSeqLink
        refseqlink_file = find_file_prefix(
            input_file=databases_refseqlink,
            prefix="ncbiRefSeqLink",
            folder=databases_refseq_folders,
            assembly=assembly,
        )
        log.debug("refSeqLink: " + str(refseqlink_file))

        # Threads
        if not threads:
            threads = self.get_threads()
        log.debug("Threads: " + str(threads))

        # Variables
        table_variants = self.get_table_variants(clause="update")

        # Get variants SNV and InDel only (REF/ALT made of letters only)
        query_variants = f"""
            SELECT "#CHROM" AS CHROM, POS, REF, ALT
            FROM {table_variants}
            WHERE REF ~ '^[A-Za-z]+$' AND ALT ~ '^[A-Za-z]+$'
            """
        df_variants = self.get_query_to_df(query_variants)

        # Added columns
        added_columns = []

        # Add hgvs column in variants table
        # (temporary working column; random suffix to avoid clashing with existing columns)
        hgvs_column_name = "hgvs_" + str(random.randrange(1000))
        added_column = self.add_column(
            table_variants, hgvs_column_name, "STRING", default_value=None
        )
        added_columns.append(added_column)

        log.debug(f"refSeq loading...")
        # refSeq in duckDB
        refseq_table = get_refseq_table(
            conn=self.conn, refseq_table="refseq", refseq_file=refseq_file
        )
        # Loading all refSeq in Dataframe
        # `df_variants` (the pandas DataFrame above) is visible to DuckDB by name
        refseq_query = f"""
            SELECT df_variants.CHROM, df_variants.POS, {refseq_table}.name AS transcript
            FROM {refseq_table}
            JOIN df_variants ON (
                {refseq_table}.chrom = df_variants.CHROM
                AND {refseq_table}.txStart<=df_variants.POS
                AND {refseq_table}.txEnd>=df_variants.POS
            )
        """
        refseq_df = self.conn.query(refseq_query).pl()

        if refseqlink_file:
            log.debug(f"refSeqLink loading...")
            # refSeqLink in duckDB
            refseqlink_table = get_refseq_table(
                conn=self.conn, refseq_table="refseqlink", refseq_file=refseqlink_file
            )
            # Loading all refSeqLink in Dataframe
            protacc_column = "protAcc_with_ver"
            mrnaacc_column = "mrnaAcc_with_ver"
            refseqlink_query = f"""
                SELECT {refseq_table}.chrom, {protacc_column} AS protein, {mrnaacc_column} AS transcript
                FROM {refseqlink_table} 
                JOIN {refseq_table} ON ({refseq_table}.name = {refseqlink_table}.mrnaAcc_with_ver)
                WHERE protAcc_without_ver IS NOT NULL
            """
            # Polars Dataframe
            # NOTE(review): `refseqlink_df` looks unused but is queried by name
            # through the Polars SQLContext inside annotation_hgvs_partition
            refseqlink_df = self.conn.query(f"{refseqlink_query}").pl()

        # Read RefSeq transcripts into a python dict/model.
        # Exported to TSV first because read_transcripts consumes a file handle
        log.debug(f"Transcripts loading...")
        with tempfile.TemporaryDirectory() as tmpdir:
            transcripts_query = f"""
                COPY (
                    SELECT {refseq_table}.*
                    FROM {refseq_table}
                    JOIN df_variants ON (
                        {refseq_table}.chrom=df_variants.CHROM
                        AND {refseq_table}.txStart<=df_variants.POS
                        AND {refseq_table}.txEnd>=df_variants.POS
                    )
                )
                TO '{tmpdir}/transcript.tsv' (DELIMITER '\t');
            """
            self.conn.query(transcripts_query)
            with open(f"{tmpdir}/transcript.tsv") as infile:
                transcripts = read_transcripts(infile)

        # Polars connexion
        # (recreated here so refseq_df/refseqlink_df, created above, are registered)
        polars_conn = pl.SQLContext(register_globals=True, eager=True)

        log.debug("Genome loading...")
        # Read genome sequence using pyfaidx.
        genome = Fasta(genome_file)

        log.debug("Start annotation HGVS...")

        # Create
        # a Dask Dataframe from Pandas dataframe with partition as number of threads
        ddf = dd.from_pandas(df_variants, npartitions=threads)

        # Use dask.dataframe.apply() to apply function on each partition
        ddf[hgvs_column_name] = ddf.map_partitions(partition_function)

        # Convert Dask DataFrame to Pandas Dataframe
        df = ddf.compute()

        # Convert Pandas dataframe to parquet (due to error in cast VARCHAR -> NULL ???)
        with tempfile.TemporaryDirectory() as tmpdir:
            df_parquet = os.path.join(tmpdir, "df.parquet")
            df.to_parquet(df_parquet)

            # Update hgvs column, joining back on the full variant key;
            # empty/NULL HGVS results are left untouched
            update_variant_query = f"""
                UPDATE {table_variants}
                SET "{hgvs_column_name}"=df."{hgvs_column_name}"
                FROM read_parquet('{df_parquet}') as df
                WHERE variants."#CHROM" = df.CHROM
                AND variants.POS = df.POS
                AND variants.REF = df.REF
                AND variants.ALT = df.ALT
                AND df."{hgvs_column_name}" NOT IN ('') AND df."{hgvs_column_name}" NOT NULL
                """
            self.execute_query(update_variant_query)

        # Update INFO column: append 'hgvs=<value>' with a ';' separator
        # only when INFO already has content
        sql_query_update = f"""
            UPDATE {table_variants}
            SET INFO = 
                concat(
                    CASE 
                        WHEN INFO NOT IN ('','.')
                        THEN concat(INFO, ';')
                        ELSE ''
                    END,
                    'hgvs=',
                    {hgvs_column_name}
                )
            WHERE "{hgvs_column_name}" NOT IN ('') AND "{hgvs_column_name}" NOT NULL
            """
        self.execute_query(sql_query_update)

        # Add header
        # NOTE(review): "annotatation" typo below is user-visible in the VCF header;
        # it is a runtime string, left unchanged here — confirm before fixing
        HGVS_INFOS = {
            "hgvs": {
                "ID": "hgvs",
                "Number": ".",
                "Type": "String",
                "Description": f"HGVS annotatation with HOWARD",
            }
        }

        for field in HGVS_INFOS:
            field_ID = HGVS_INFOS[field]["ID"]
            field_description = HGVS_INFOS[field]["Description"]
            self.get_header().infos[field_ID] = vcf.parser._Info(
                field_ID,
                HGVS_INFOS[field]["Number"],
                HGVS_INFOS[field]["Type"],
                field_description,
                "unknown",
                "unknown",
                code_type_map[HGVS_INFOS[field]["Type"]],
            )

        # Remove added columns
        for added_column in added_columns:
            self.drop_column(column=added_column)

The annotation_hgvs function performs HGVS annotation on a set of variants using genomic coordinates and alleles.

Parameters
  • threads: The threads parameter is an optional integer that specifies the number of threads to use for parallel processing. If no value is provided, it will default to the number of threads obtained from the get_threads() method
def get_operations_help( self, operations_config_dict: dict = {}, operations_config_file: str = None) -> list:
7836    def get_operations_help(
7837        self, operations_config_dict: dict = {}, operations_config_file: str = None
7838    ) -> list:
7839
7840        # Init
7841        operations_help = []
7842
7843        # operations
7844        operations = self.get_config_json(
7845            name="calculations",
7846            config_dict=operations_config_dict,
7847            config_file=operations_config_file,
7848        )
7849        for op in operations:
7850            op_name = operations[op].get("name", op).upper()
7851            op_description = operations[op].get("description", op_name)
7852            op_available = operations[op].get("available", False)
7853            if op_available:
7854                operations_help.append(f"   {op_name}: {op_description}")
7855
7856        # Sort operations
7857        operations_help.sort()
7858
7859        # insert header
7860        operations_help.insert(0, "Available calculation operations:")
7861
7862        # Return
7863        return operations_help
def calculation( self, operations: dict = {}, operations_config_dict: dict = {}, operations_config_file: str = None) -> None:
7865    def calculation(
7866        self,
7867        operations: dict = {},
7868        operations_config_dict: dict = {},
7869        operations_config_file: str = None,
7870    ) -> None:
7871        """
7872        It takes a list of operations, and for each operation, it checks if it's a python or sql
7873        operation, and then calls the appropriate function
7874
7875        param json example:
7876            "calculation": {
7877                "NOMEN": {
7878                    "options": {
7879                        "hgvs_field": "hgvs"
7880                    },
7881                "middle" : null
7882            }
7883        """
7884
7885        # Param
7886        param = self.get_param()
7887
7888        # operations config
7889        operations_config = self.get_config_json(
7890            name="calculations",
7891            config_dict=operations_config_dict,
7892            config_file=operations_config_file,
7893        )
7894
7895        # Upper keys
7896        operations_config = {k.upper(): v for k, v in operations_config.items()}
7897
7898        # Calculations
7899
7900        # Operations from param
7901        operations = param.get("calculation", {}).get("calculations", operations)
7902
7903        # Quick calculation - add
7904        if param.get("calculations", None):
7905            calculations_list = [
7906                value for value in param.get("calculations", "").split(",")
7907            ]
7908            log.info(f"Quick Calculations:")
7909            for calculation_key in calculations_list:
7910                log.info(f"   {calculation_key}")
7911            for calculation_operation in calculations_list:
7912                if calculation_operation.upper() not in operations:
7913                    operations[calculation_operation.upper()] = {}
7914                    add_value_into_dict(
7915                        dict_tree=param,
7916                        sections=[
7917                            "calculation",
7918                            "calculations",
7919                            calculation_operation.upper(),
7920                        ],
7921                        value={},
7922                    )
7923
7924        # Operations for calculation
7925        if not operations:
7926            operations = param.get("calculation", {}).get("calculations", {})
7927
7928        if operations:
7929            log.info(f"Calculations...")
7930
7931        # For each operations
7932        for operation_name in operations:
7933            operation_name = operation_name.upper()
7934            if operation_name not in [""]:
7935                if operation_name in operations_config:
7936                    log.info(f"Calculation '{operation_name}'")
7937                    operation = operations_config[operation_name]
7938                    operation_type = operation.get("type", "sql")
7939                    if operation_type == "python":
7940                        self.calculation_process_function(
7941                            operation=operation, operation_name=operation_name
7942                        )
7943                    elif operation_type == "sql":
7944                        self.calculation_process_sql(
7945                            operation=operation, operation_name=operation_name
7946                        )
7947                    else:
7948                        log.error(
7949                            f"Operations config: Type '{operation_type}' NOT available"
7950                        )
7951                        raise ValueError(
7952                            f"Operations config: Type '{operation_type}' NOT available"
7953                        )
7954                else:
7955                    log.error(
7956                        f"Operations config: Calculation '{operation_name}' NOT available"
7957                    )
7958                    raise ValueError(
7959                        f"Operations config: Calculation '{operation_name}' NOT available"
7960                    )
7961
7962        # Explode INFOS fields into table fields
7963        if self.get_explode_infos():
7964            self.explode_infos(
7965                prefix=self.get_explode_infos_prefix(),
7966                fields=self.get_explode_infos_fields(),
7967                force=True,
7968            )

It takes a list of operations, and for each operation, it checks if it's a python or sql operation, and then calls the appropriate function

param json example: "calculation": { "NOMEN": { "options": { "hgvs_field": "hgvs" }, "middle": null } }

def calculation_process_sql(self, operation: dict, operation_name: str = 'unknown') -> None:
7970    def calculation_process_sql(
7971        self, operation: dict, operation_name: str = "unknown"
7972    ) -> None:
7973        """
7974        The `calculation_process_sql` function takes in a mathematical operation as a string and
7975        performs the operation, updating the specified table with the result.
7976
7977        :param operation: The `operation` parameter is a dictionary that contains information about the
7978        mathematical operation to be performed. It includes the following keys:
7979        :type operation: dict
7980        :param operation_name: The `operation_name` parameter is a string that represents the name of
7981        the mathematical operation being performed. It is used for logging and error handling purposes,
7982        defaults to unknown
7983        :type operation_name: str (optional)
7984        """
7985
7986        # table variants
7987        table_variants = self.get_table_variants(clause="alter")
7988
7989        # Operation infos
7990        operation_name = operation.get("name", "unknown")
7991        log.debug(f"process sql {operation_name}")
7992        output_column_name = operation.get("output_column_name", operation_name)
7993        output_column_type = operation.get("output_column_type", "String")
7994        prefix = operation.get("explode_infos_prefix", "")
7995        output_column_type_sql = code_type_map_to_sql.get(output_column_type, "VARCHAR")
7996        output_column_description = operation.get(
7997            "output_column_description", f"{operation_name} operation"
7998        )
7999        operation_query = operation.get("operation_query", None)
8000        if isinstance(operation_query, list):
8001            operation_query = " ".join(operation_query)
8002        operation_info_fields = operation.get("info_fields", [])
8003        operation_info_fields_check = operation.get("info_fields_check", False)
8004        operation_info = operation.get("operation_info", True)
8005
8006        if operation_query:
8007
8008            # Info fields check
8009            operation_info_fields_check_result = True
8010            if operation_info_fields_check:
8011                header_infos = self.get_header().infos
8012                for info_field in operation_info_fields:
8013                    operation_info_fields_check_result = (
8014                        operation_info_fields_check_result
8015                        and info_field in header_infos
8016                    )
8017
8018            # If info fields available
8019            if operation_info_fields_check_result:
8020
8021                # Added_columns
8022                added_columns = []
8023
8024                # Create VCF header field
8025                vcf_reader = self.get_header()
8026                vcf_reader.infos[output_column_name] = vcf.parser._Info(
8027                    output_column_name,
8028                    ".",
8029                    output_column_type,
8030                    output_column_description,
8031                    "howard calculation",
8032                    "0",
8033                    self.code_type_map.get(output_column_type),
8034                )
8035
8036                # Explode infos if needed
8037                log.debug(f"calculation_process_sql prefix {prefix}")
8038                added_columns += self.explode_infos(
8039                    prefix=prefix,
8040                    fields=[output_column_name] + operation_info_fields,
8041                    force=True,
8042                )
8043
8044                # Create column
8045                added_column = self.add_column(
8046                    table_name=table_variants,
8047                    column_name=prefix + output_column_name,
8048                    column_type=output_column_type_sql,
8049                    default_value="null",
8050                )
8051                added_columns.append(added_column)
8052
8053                # Operation calculation
8054                try:
8055
8056                    # Query to update calculation column
8057                    sql_update = f"""
8058                        UPDATE {table_variants}
8059                        SET "{prefix}{output_column_name}" = ({operation_query})
8060                    """
8061                    self.conn.execute(sql_update)
8062
8063                    # Add to INFO
8064                    if operation_info:
8065                        sql_update_info = f"""
8066                            UPDATE {table_variants}
8067                            SET "INFO" =
8068                                concat(
8069                                    CASE
8070                                        WHEN "INFO" IS NOT NULL
8071                                        THEN concat("INFO", ';')
8072                                        ELSE ''
8073                                    END,
8074                                    '{output_column_name}=',
8075                                    "{prefix}{output_column_name}"
8076                                )
8077                            WHERE "{prefix}{output_column_name}" IS NOT NULL AND "{prefix}{output_column_name}" NOT IN ('')
8078                        """
8079                        self.conn.execute(sql_update_info)
8080
8081                except:
8082                    log.error(
8083                        f"Operations config: Calculation '{operation_name}' query failed"
8084                    )
8085                    raise ValueError(
8086                        f"Operations config: Calculation '{operation_name}' query failed"
8087                    )
8088
8089                # Remove added columns
8090                for added_column in added_columns:
8091                    log.debug(f"added_column: {added_column}")
8092                    self.drop_column(column=added_column)
8093
8094            else:
8095                log.error(
8096                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8097                )
8098                raise ValueError(
8099                    f"Operations config: Calculation '{operation_name}' DOES NOT contain all mandatory fields {operation_info_fields}"
8100                )
8101
8102        else:
8103            log.error(
8104                f"Operations config: Calculation '{operation_name}' query NOT defined"
8105            )
8106            raise ValueError(
8107                f"Operations config: Calculation '{operation_name}' query NOT defined"
8108            )

The calculation_process_sql function takes in a mathematical operation as a string and performs the operation, updating the specified table with the result.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the mathematical operation to be performed. It includes the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the mathematical operation being performed. It is used for logging and error handling purposes, defaults to unknown
def calculation_process_function(self, operation: dict, operation_name: str = 'unknown') -> None:
8110    def calculation_process_function(
8111        self, operation: dict, operation_name: str = "unknown"
8112    ) -> None:
8113        """
8114        The `calculation_process_function` takes in an operation dictionary and performs the specified
8115        function with the given parameters.
8116
8117        :param operation: The `operation` parameter is a dictionary that contains information about the
8118        operation to be performed. It has the following keys:
8119        :type operation: dict
8120        :param operation_name: The `operation_name` parameter is a string that represents the name of
8121        the operation being performed. It is used for logging purposes, defaults to unknown
8122        :type operation_name: str (optional)
8123        """
8124
8125        operation_name = operation["name"]
8126        log.debug(f"process sql {operation_name}")
8127        function_name = operation["function_name"]
8128        function_params = operation["function_params"]
8129        getattr(self, function_name)(*function_params)

The calculation_process_function takes in an operation dictionary and performs the specified function with the given parameters.

Parameters
  • operation: The operation parameter is a dictionary that contains information about the operation to be performed. It has the following keys:
  • operation_name: The operation_name parameter is a string that represents the name of the operation being performed. It is used for logging purposes, defaults to unknown
def calculation_variant_id(self) -> None:
8131    def calculation_variant_id(self) -> None:
8132        """
8133        The function `calculation_variant_id` adds a variant ID annotation to a VCF file header and
8134        updates the INFO field of a variants table with the variant ID.
8135        """
8136
8137        # variant_id annotation field
8138        variant_id_tag = self.get_variant_id_column()
8139        added_columns = [variant_id_tag]
8140
8141        # variant_id hgvs tags"
8142        vcf_infos_tags = {
8143            variant_id_tag: "howard variant ID annotation",
8144        }
8145
8146        # Variants table
8147        table_variants = self.get_table_variants()
8148
8149        # Header
8150        vcf_reader = self.get_header()
8151
8152        # Add variant_id to header
8153        vcf_reader.infos[variant_id_tag] = vcf.parser._Info(
8154            variant_id_tag,
8155            ".",
8156            "String",
8157            vcf_infos_tags.get(variant_id_tag, "howard variant ID annotation"),
8158            "howard calculation",
8159            "0",
8160            self.code_type_map.get("String"),
8161        )
8162
8163        # Update
8164        sql_update = f"""
8165            UPDATE {table_variants}
8166            SET "INFO" = 
8167                concat(
8168                    CASE
8169                        WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8170                        THEN ''
8171                        ELSE concat("INFO", ';')
8172                    END,
8173                    '{variant_id_tag}=',
8174                    "{variant_id_tag}"
8175                )
8176        """
8177        self.conn.execute(sql_update)
8178
8179        # Remove added columns
8180        for added_column in added_columns:
8181            self.drop_column(column=added_column)

The function calculation_variant_id adds a variant ID annotation to a VCF file header and updates the INFO field of a variants table with the variant ID.

def calculation_extract_snpeff_hgvs( self, snpeff_hgvs: str = 'snpeff_hgvs', snpeff_field: str = 'ANN') -> None:
8183    def calculation_extract_snpeff_hgvs(
8184        self,
8185        snpeff_hgvs: str = "snpeff_hgvs",
8186        snpeff_field: str = "ANN",
8187    ) -> None:
8188        """
8189        The function `calculation_extract_snpeff_hgvs` extracts HGVS nomenclatures from the SnpEff
8190        annotation field in a VCF file and adds them as a new column in the variants table.
8191
8192        :param snpeff_hgvs: The `snpeff_hgvs` parameter in the `calculation_extract_snpeff_hgvs`
8193        function is used to specify the name of the column that will store the HGVS nomenclatures
8194        extracted from the SnpEff annotation field in a VCF file. This parameter allows you, defaults to
8195        snpeff_hgvs
8196        :type snpeff_hgvs: str (optional)
8197        :param snpeff_field: The `snpeff_field` parameter in the `calculation_extract_snpeff_hgvs`
8198        function represents the field in the VCF file that contains SnpEff annotations. This field is
8199        used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults
8200        to ANN
8201        :type snpeff_field: str (optional)
8202        """
8203
8204        # Snpeff hgvs tags
8205        vcf_infos_tags = {
8206            snpeff_hgvs: "HGVS nomenclatures from snpEff annotation",
8207        }
8208
8209        # Prefix
8210        prefix = self.get_explode_infos_prefix()
8211        if prefix:
8212            prefix = "INFO/"
8213
8214        # snpEff fields
8215        speff_ann_infos = prefix + snpeff_field
8216        speff_hgvs_infos = prefix + snpeff_hgvs
8217
8218        # Variants table
8219        table_variants = self.get_table_variants()
8220
8221        # Header
8222        vcf_reader = self.get_header()
8223
8224        # Add columns
8225        added_columns = []
8226
8227        # Explode HGVS field in column
8228        added_columns += self.explode_infos(fields=[snpeff_field])
8229
8230        if snpeff_field in vcf_reader.infos:
8231
8232            log.debug(vcf_reader.infos[snpeff_field])
8233
8234            # Extract ANN header
8235            ann_description = vcf_reader.infos[snpeff_field].desc
8236            pattern = r"'(.+?)'"
8237            match = re.search(pattern, ann_description)
8238            if match:
8239                ann_header_match = match.group(1).split(" | ")
8240                ann_header_desc = {}
8241                for i in range(len(ann_header_match)):
8242                    ann_header_info = "".join(
8243                        char for char in ann_header_match[i] if char.isalnum()
8244                    )
8245                    ann_header_desc[ann_header_info] = ann_header_match[i]
8246                if not ann_header_desc:
8247                    raise ValueError("Invalid header description format")
8248            else:
8249                raise ValueError("Invalid header description format")
8250
8251            # Create variant id
8252            variant_id_column = self.get_variant_id_column()
8253            added_columns += [variant_id_column]
8254
8255            # Create dataframe
8256            dataframe_snpeff_hgvs = self.get_query_to_df(
8257                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8258            )
8259
8260            # Create main NOMEN column
8261            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8262                speff_ann_infos
8263            ].apply(
8264                lambda x: extract_snpeff_hgvs(
8265                    str(x), header=list(ann_header_desc.values())
8266                )
8267            )
8268
8269            # Add snpeff_hgvs to header
8270            vcf_reader.infos[snpeff_hgvs] = vcf.parser._Info(
8271                snpeff_hgvs,
8272                ".",
8273                "String",
8274                vcf_infos_tags.get(snpeff_hgvs, "snpEff hgvs annotations"),
8275                "howard calculation",
8276                "0",
8277                self.code_type_map.get("String"),
8278            )
8279
8280            # Update
8281            sql_update = f"""
8282                UPDATE variants
8283                SET "INFO" = 
8284                    concat(
8285                        CASE
8286                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8287                            THEN ''
8288                            ELSE concat("INFO", ';')
8289                        END,
8290                        CASE 
8291                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8292                            AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8293                            THEN concat(
8294                                    '{snpeff_hgvs}=',
8295                                    dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8296                                )
8297                            ELSE ''
8298                        END
8299                    )
8300                FROM dataframe_snpeff_hgvs
8301                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8302
8303            """
8304            self.conn.execute(sql_update)
8305
8306            # Delete dataframe
8307            del dataframe_snpeff_hgvs
8308            gc.collect()
8309
8310        else:
8311
8312            log.warning(
8313                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8314            )
8315
8316        # Remove added columns
8317        for added_column in added_columns:
8318            self.drop_column(column=added_column)

The function calculation_extract_snpeff_hgvs extracts HGVS nomenclatures from the SnpEff annotation field in a VCF file and adds them as a new column in the variants table.

Parameters
  • snpeff_hgvs: The snpeff_hgvs parameter in the calculation_extract_snpeff_hgvs function specifies the name of the column that will store the HGVS nomenclatures extracted from the SnpEff annotation field in a VCF file; defaults to snpeff_hgvs
  • snpeff_field: The snpeff_field parameter in the calculation_extract_snpeff_hgvs function represents the field in the VCF file that contains SnpEff annotations. This field is used to extract HGVS nomenclatures from the SnpEff annotation field and add them as a, defaults to ANN
def calculation_snpeff_ann_explode( self, uniquify: bool = True, output_format: str = 'fields', output_prefix: str = 'snpeff_', snpeff_field: str = 'ANN') -> None:
8320    def calculation_snpeff_ann_explode(
8321        self,
8322        uniquify: bool = True,
8323        output_format: str = "fields",
8324        output_prefix: str = "snpeff_",
8325        snpeff_field: str = "ANN",
8326    ) -> None:
8327        """
8328        The `calculation_snpeff_ann_explode` function processes SnpEff annotations in a VCF file by
8329        exploding the HGVS field and updating variant information accordingly.
8330
8331        :param uniquify: The `uniquify` parameter in the `calculation_snpeff_ann_explode` method is a
8332        boolean flag that determines whether the output should be uniquified or not. When set to `True`,
8333        it indicates that the output should be unique, meaning that duplicate entries should be removed,
8334        defaults to True
8335        :type uniquify: bool (optional)
8336        :param output_format: The `output_format` parameter in the `calculation_snpeff_ann_explode`
8337        function specifies the format in which the output annotations will be generated. It has a
8338        default value of "fields". You can also set it to "JSON" to output the annotations in JSON
8339        format, defaults to fields
8340        :type output_format: str (optional)
8341        :param output_prefix: The `output_prefix` parameter in the `calculation_snpeff_ann_explode`
8342        method is used to specify the prefix that will be added to the output annotations generated
8343        during the calculation process. This prefix helps to differentiate the newly added annotations
8344        from existing ones in the output data. By default, the, defaults to ANN_
8345        :type output_prefix: str (optional)
8346        :param snpeff_field: The `snpeff_field` parameter in the `calculation_snpeff_ann_explode`
8347        function is used to specify the field in the VCF file that contains SnpEff annotations. This
8348        field will be processed to explode the HGVS annotations and update the variant information
8349        accordingly, defaults to ANN
8350        :type snpeff_field: str (optional)
8351        """
8352
8353        # SnpEff annotation field
8354        snpeff_hgvs = "snpeff_ann_explode"
8355
8356        # Snpeff hgvs tags
8357        vcf_infos_tags = {
8358            snpeff_hgvs: "Explode snpEff annotations",
8359        }
8360
8361        # Prefix
8362        prefix = self.get_explode_infos_prefix()
8363        if prefix:
8364            prefix = "INFO/"
8365
8366        # snpEff fields
8367        speff_ann_infos = prefix + snpeff_field
8368        speff_hgvs_infos = prefix + snpeff_hgvs
8369
8370        # Variants table
8371        table_variants = self.get_table_variants()
8372
8373        # Header
8374        vcf_reader = self.get_header()
8375
8376        # Add columns
8377        added_columns = []
8378
8379        # Explode HGVS field in column
8380        added_columns += self.explode_infos(fields=[snpeff_field])
8381        log.debug(f"snpeff_field={snpeff_field}")
8382        log.debug(f"added_columns={added_columns}")
8383
8384        if snpeff_field in vcf_reader.infos:
8385
8386            # Extract ANN header
8387            ann_description = vcf_reader.infos[snpeff_field].desc
8388            pattern = r"'(.+?)'"
8389            match = re.search(pattern, ann_description)
8390            if match:
8391                ann_header_match = match.group(1).split(" | ")
8392                ann_header = []
8393                ann_header_desc = {}
8394                for i in range(len(ann_header_match)):
8395                    ann_header_info = "".join(
8396                        char for char in ann_header_match[i] if char.isalnum()
8397                    )
8398                    ann_header.append(ann_header_info)
8399                    ann_header_desc[ann_header_info] = ann_header_match[i]
8400                if not ann_header_desc:
8401                    raise ValueError("Invalid header description format")
8402            else:
8403                raise ValueError("Invalid header description format")
8404
8405            # Create variant id
8406            variant_id_column = self.get_variant_id_column()
8407            added_columns += [variant_id_column]
8408
8409            # Create dataframe
8410            dataframe_snpeff_hgvs = self.get_query_to_df(
8411                f""" SELECT "{variant_id_column}", "{speff_ann_infos}" FROM {table_variants} """
8412            )
8413
8414            # Create snpEff columns
8415            dataframe_snpeff_hgvs[speff_hgvs_infos] = dataframe_snpeff_hgvs[
8416                speff_ann_infos
8417            ].apply(
8418                lambda x: explode_snpeff_ann(
8419                    str(x),
8420                    uniquify=uniquify,
8421                    output_format=output_format,
8422                    prefix=output_prefix,
8423                    header=list(ann_header_desc.values()),
8424                )
8425            )
8426
8427            # Header
8428            ann_annotations_prefix = ""
8429            if output_format.upper() in ["JSON"]:
8430                ann_annotations_prefix = f"{output_prefix}="
8431                vcf_reader.infos[output_prefix] = vcf.parser._Info(
8432                    output_prefix,
8433                    ".",
8434                    "String",
8435                    vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8436                    + " - JSON format",
8437                    "howard calculation",
8438                    "0",
8439                    self.code_type_map.get("String"),
8440                )
8441            else:
8442                for ann_annotation in ann_header:
8443                    ann_annotation_id = f"{output_prefix}{ann_annotation}"
8444                    vcf_reader.infos[ann_annotation_id] = vcf.parser._Info(
8445                        ann_annotation_id,
8446                        ".",
8447                        "String",
8448                        vcf_infos_tags.get(snpeff_hgvs, "snpEff annotations")
8449                        + f" - '{ann_header_desc[ann_annotation]}' annotation",
8450                        "howard calculation",
8451                        "0",
8452                        self.code_type_map.get("String"),
8453                    )
8454
8455            # Update
8456            sql_update = f"""
8457                UPDATE variants
8458                SET "INFO" = 
8459                    concat(
8460                        CASE
8461                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8462                            THEN ''
8463                            ELSE concat("INFO", ';')
8464                        END,
8465                        CASE 
8466                            WHEN dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT IN ('','.','NaN')
8467                                AND dataframe_snpeff_hgvs."{speff_hgvs_infos}" NOT NULL
8468                            THEN concat(
8469                                '{ann_annotations_prefix}',
8470                                dataframe_snpeff_hgvs."{speff_hgvs_infos}"
8471                                )
8472                            ELSE ''
8473                        END
8474                    )
8475                FROM dataframe_snpeff_hgvs
8476                WHERE {table_variants}."{variant_id_column}" = dataframe_snpeff_hgvs."{variant_id_column}"
8477
8478            """
8479            self.conn.execute(sql_update)
8480
8481            # Delete dataframe
8482            del dataframe_snpeff_hgvs
8483            gc.collect()
8484
8485        else:
8486
8487            log.warning(
8488                "No snpEff annotation. Please Anotate with snpEff before use this calculation option"
8489            )
8490
8491        # Remove added columns
8492        for added_column in added_columns:
8493            self.drop_column(column=added_column)

The calculation_snpeff_ann_explode function processes SnpEff annotations in a VCF file by exploding the HGVS field and updating variant information accordingly.

Parameters
  • uniquify: The uniquify parameter in the calculation_snpeff_ann_explode method is a boolean flag that determines whether the output should be uniquified or not. When set to True, it indicates that the output should be unique, meaning that duplicate entries should be removed, defaults to True
  • output_format: The output_format parameter in the calculation_snpeff_ann_explode function specifies the format in which the output annotations will be generated. It has a default value of "fields". You can also set it to "JSON" to output the annotations in JSON format, defaults to fields
  • output_prefix: The output_prefix parameter in the calculation_snpeff_ann_explode method specifies the prefix added to the output annotations generated during the calculation process, to differentiate the newly added annotations from existing ones in the output data; defaults to snpeff_
  • snpeff_field: The snpeff_field parameter in the calculation_snpeff_ann_explode function is used to specify the field in the VCF file that contains SnpEff annotations. This field will be processed to explode the HGVS annotations and update the variant information accordingly, defaults to ANN
def calculation_extract_nomen(self) -> None:
8495    def calculation_extract_nomen(self) -> None:
8496        """
8497        This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.
8498        """
8499
8500        # NOMEN field
8501        field_nomen_dict = "NOMEN_DICT"
8502
8503        # NOMEN structure
8504        nomen_dict = {
8505            "NOMEN": "NOMEN hgvs nomenclature considered as reference hgvs (official transcript, first otherwise)",
8506            "CNOMEN": "CNOMEN hgvs nomenclature at DNA level related to a transcript (TNOMEN)",
8507            "RNOMEN": "RNOMEN hgvs nomenclature at RNA level related to a transcript (TNOMEN)",
8508            "NNOMEN": "NNOMEN hgvs nomenclature for non-coding variant",
8509            "PNOMEN": "PNOMEN hgvs nomenclature at Protein level related to a transcript (TNOMEN)",
8510            "TVNOMEN": "TVNOMEN hgvs transcript with version (if any) used (e.g. for CNOMEN and PNOMEN)",
8511            "TNOMEN": "TNOMEN hgvs transcript used (e.g. for CNOMEN and PNOMEN)",
8512            "VNOMEN": "VNOMEN hgvs transcript version used (e.g. for CNOMEN and PNOMEN)",
8513            "ENOMEN": "ENOMEN hgvs exon nomenclature related to a transcript (TNOMEN)",
8514            "GNOMEN": "GNOMEN hgvs gene nomenclature related to a transcript (TNOMEN)",
8515        }
8516
8517        # Param
8518        param = self.get_param()
8519
8520        # Prefix
8521        prefix = self.get_explode_infos_prefix()
8522
8523        # Header
8524        vcf_reader = self.get_header()
8525
8526        # Get HGVS field
8527        hgvs_field = (
8528            param.get("calculation", {})
8529            .get("calculations", {})
8530            .get("NOMEN", {})
8531            .get("options", {})
8532            .get("hgvs_field", "hgvs")
8533        )
8534
8535        # Get transcripts
8536        transcripts_file = (
8537            param.get("calculation", {})
8538            .get("calculations", {})
8539            .get("NOMEN", {})
8540            .get("options", {})
8541            .get("transcripts", None)
8542        )
8543        transcripts_file = full_path(transcripts_file)
8544        transcripts = []
8545        if transcripts_file:
8546            if os.path.exists(transcripts_file):
8547                transcripts_dataframe = transcripts_file_to_df(transcripts_file)
8548                transcripts = transcripts_dataframe.iloc[:, 0].tolist()
8549            else:
8550                log.error(f"Transcript file '{transcripts_file}' does NOT exist")
8551                raise ValueError(f"Transcript file '{transcripts_file}' does NOT exist")
8552
8553        # Added columns
8554        added_columns = []
8555
8556        # Explode HGVS field in column
8557        added_columns += self.explode_infos(fields=[hgvs_field])
8558
8559        # extra infos
8560        extra_infos = self.get_extra_infos()
8561        extra_field = prefix + hgvs_field
8562
8563        if extra_field in extra_infos:
8564
8565            # Create dataframe
8566            dataframe_hgvs = self.get_query_to_df(
8567                f""" SELECT "#CHROM", "POS", "REF", "ALT", "{extra_field}" FROM variants """
8568            )
8569
8570            # Create main NOMEN column
8571            dataframe_hgvs[field_nomen_dict] = dataframe_hgvs[extra_field].apply(
8572                lambda x: find_nomen(str(x), transcripts=transcripts)
8573            )
8574
8575            # Explode NOMEN Structure and create SQL set for update
8576            sql_nomen_fields = []
8577            for nomen_field in nomen_dict:
8578
8579                # Explode each field into a column
8580                dataframe_hgvs[nomen_field] = dataframe_hgvs[field_nomen_dict].apply(
8581                    lambda x: dict(x).get(nomen_field, "")
8582                )
8583
8584                # Create VCF header field
8585                vcf_reader.infos[nomen_field] = vcf.parser._Info(
8586                    nomen_field,
8587                    ".",
8588                    "String",
8589                    nomen_dict.get(nomen_field, "howard calculation NOMEN"),
8590                    "howard calculation",
8591                    "0",
8592                    self.code_type_map.get("String"),
8593                )
8594                sql_nomen_fields.append(
8595                    f"""
8596                        CASE 
8597                            WHEN dataframe_hgvs."{nomen_field}" NOT NULL AND dataframe_hgvs."{nomen_field}" NOT IN ('')
8598                            THEN concat(
8599                                    ';{nomen_field}=',
8600                                    dataframe_hgvs."{nomen_field}"
8601                                )
8602                            ELSE ''
8603                        END
8604                    """
8605                )
8606
8607            # SQL set for update
8608            sql_nomen_fields_set = ", ".join(sql_nomen_fields)
8609
8610            # Update
8611            sql_update = f"""
8612                UPDATE variants
8613                SET "INFO" = 
8614                    concat(
8615                        CASE
8616                            WHEN "INFO" IS NULL
8617                            THEN ''
8618                            ELSE "INFO"
8619                        END,
8620                        {sql_nomen_fields_set}
8621                    )
8622                FROM dataframe_hgvs
8623                WHERE variants."#CHROM" = dataframe_hgvs."#CHROM"
8624                    AND variants."POS" = dataframe_hgvs."POS" 
8625                    AND variants."REF" = dataframe_hgvs."REF"
8626                    AND variants."ALT" = dataframe_hgvs."ALT"
8627            """
8628            self.conn.execute(sql_update)
8629
8630            # Delete dataframe
8631            del dataframe_hgvs
8632            gc.collect()
8633
8634        # Remove added columns
8635        for added_column in added_columns:
8636            self.drop_column(column=added_column)

This function extracts the HGVS nomenclature from the calculation/identification of NOMEN.

def calculation_find_by_pipeline(self, tag: str = 'findbypipeline') -> None:
8638    def calculation_find_by_pipeline(self, tag: str = "findbypipeline") -> None:
8639        """
8640        The function `calculation_find_by_pipeline` performs a calculation to find the number of
8641        pipeline/sample for a variant and updates the variant information in a VCF file.
8642
8643        :param tag: The `tag` parameter is a string that represents the annotation field for the
8644        "findbypipeline" information in the VCF file. It is used to create the annotation field in the
8645        VCF header and to update the corresponding field in the variants table, defaults to
8646        findbypipeline
8647        :type tag: str (optional)
8648        """
8649
8650        # if FORMAT and samples
8651        if (
8652            "FORMAT" in self.get_header_columns_as_list()
8653            and self.get_header_sample_list()
8654        ):
8655
8656            # findbypipeline annotation field
8657            findbypipeline_tag = tag
8658
8659            # VCF infos tags
8660            vcf_infos_tags = {
8661                findbypipeline_tag: f"Number of pipeline/sample for a variant ({findbypipeline_tag})",
8662            }
8663
8664            # Prefix
8665            prefix = self.get_explode_infos_prefix()
8666
8667            # Field
8668            findbypipeline_infos = prefix + findbypipeline_tag
8669
8670            # Variants table
8671            table_variants = self.get_table_variants()
8672
8673            # Header
8674            vcf_reader = self.get_header()
8675
8676            # Create variant id
8677            variant_id_column = self.get_variant_id_column()
8678            added_columns = [variant_id_column]
8679
8680            # variant_id, FORMAT and samples
8681            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8682                self.get_header_sample_list()
8683            )
8684
8685            # Create dataframe
8686            dataframe_findbypipeline = self.get_query_to_df(
8687                f""" SELECT {samples_fields} FROM {table_variants} """
8688            )
8689
8690            # Create findbypipeline column
8691            dataframe_findbypipeline[findbypipeline_infos] = (
8692                dataframe_findbypipeline.apply(
8693                    lambda row: findbypipeline(
8694                        row, samples=self.get_header_sample_list()
8695                    ),
8696                    axis=1,
8697                )
8698            )
8699
8700            # Add snpeff_hgvs to header
8701            vcf_reader.infos[findbypipeline_tag] = vcf.parser._Info(
8702                findbypipeline_tag,
8703                ".",
8704                "String",
8705                vcf_infos_tags.get(findbypipeline_tag, "Find in pipeline/sample"),
8706                "howard calculation",
8707                "0",
8708                self.code_type_map.get("String"),
8709            )
8710
8711            # Update
8712            sql_update = f"""
8713                UPDATE variants
8714                SET "INFO" = 
8715                    concat(
8716                        CASE
8717                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8718                            THEN ''
8719                            ELSE concat("INFO", ';')
8720                        END,
8721                        CASE 
8722                            WHEN dataframe_findbypipeline."{findbypipeline_infos}" NOT IN ('','.')
8723                                AND dataframe_findbypipeline."{findbypipeline_infos}" NOT NULL
8724                            THEN concat(
8725                                    '{findbypipeline_tag}=',
8726                                    dataframe_findbypipeline."{findbypipeline_infos}"
8727                                )
8728                            ELSE ''
8729                        END
8730                    )
8731                FROM dataframe_findbypipeline
8732                WHERE variants."{variant_id_column}" = dataframe_findbypipeline."{variant_id_column}"
8733            """
8734            self.conn.execute(sql_update)
8735
8736            # Remove added columns
8737            for added_column in added_columns:
8738                self.drop_column(column=added_column)
8739
8740            # Delete dataframe
8741            del dataframe_findbypipeline
8742            gc.collect()

The function calculation_find_by_pipeline performs a calculation to find the number of pipeline/sample for a variant and updates the variant information in a VCF file.

Parameters
  • tag: The tag parameter is a string that represents the annotation field for the "findbypipeline" information in the VCF file. It is used to create the annotation field in the VCF header and to update the corresponding field in the variants table, defaults to findbypipeline
def calculation_genotype_concordance(self) -> None:
8744    def calculation_genotype_concordance(self) -> None:
8745        """
8746        The function `calculation_genotype_concordance` calculates the genotype concordance for
8747        multi-caller VCF files and updates the variant information in the database.
8748        """
8749
8750        # if FORMAT and samples
8751        if (
8752            "FORMAT" in self.get_header_columns_as_list()
8753            and self.get_header_sample_list()
8754        ):
8755
8756            # genotypeconcordance annotation field
8757            genotypeconcordance_tag = "genotypeconcordance"
8758
8759            # VCF infos tags
8760            vcf_infos_tags = {
8761                genotypeconcordance_tag: "Concordance of genotype for multi caller VCF",
8762            }
8763
8764            # Prefix
8765            prefix = self.get_explode_infos_prefix()
8766
8767            # Field
8768            genotypeconcordance_infos = prefix + genotypeconcordance_tag
8769
8770            # Variants table
8771            table_variants = self.get_table_variants()
8772
8773            # Header
8774            vcf_reader = self.get_header()
8775
8776            # Create variant id
8777            variant_id_column = self.get_variant_id_column()
8778            added_columns = [variant_id_column]
8779
8780            # variant_id, FORMAT and samples
8781            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8782                self.get_header_sample_list()
8783            )
8784
8785            # Create dataframe
8786            dataframe_genotypeconcordance = self.get_query_to_df(
8787                f""" SELECT {samples_fields} FROM {table_variants} """
8788            )
8789
8790            # Create genotypeconcordance column
8791            dataframe_genotypeconcordance[genotypeconcordance_infos] = (
8792                dataframe_genotypeconcordance.apply(
8793                    lambda row: genotypeconcordance(
8794                        row, samples=self.get_header_sample_list()
8795                    ),
8796                    axis=1,
8797                )
8798            )
8799
8800            # Add genotypeconcordance to header
8801            vcf_reader.infos[genotypeconcordance_tag] = vcf.parser._Info(
8802                genotypeconcordance_tag,
8803                ".",
8804                "String",
8805                vcf_infos_tags.get(genotypeconcordance_tag, "snpEff hgvs annotations"),
8806                "howard calculation",
8807                "0",
8808                self.code_type_map.get("String"),
8809            )
8810
8811            # Update
8812            sql_update = f"""
8813                UPDATE variants
8814                SET "INFO" = 
8815                    concat(
8816                        CASE
8817                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8818                            THEN ''
8819                            ELSE concat("INFO", ';')
8820                        END,
8821                        CASE
8822                            WHEN dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT IN ('','.')
8823                                AND dataframe_genotypeconcordance."{genotypeconcordance_infos}" NOT NULL
8824                            THEN concat(
8825                                    '{genotypeconcordance_tag}=',
8826                                    dataframe_genotypeconcordance."{genotypeconcordance_infos}"
8827                                )
8828                            ELSE ''
8829                        END
8830                    )
8831                FROM dataframe_genotypeconcordance
8832                WHERE variants."{variant_id_column}" = dataframe_genotypeconcordance."{variant_id_column}"
8833            """
8834            self.conn.execute(sql_update)
8835
8836            # Remove added columns
8837            for added_column in added_columns:
8838                self.drop_column(column=added_column)
8839
8840            # Delete dataframe
8841            del dataframe_genotypeconcordance
8842            gc.collect()

The function calculation_genotype_concordance calculates the genotype concordance for multi-caller VCF files and updates the variant information in the database.

def calculation_barcode(self, tag: str = 'barcode') -> None:
8844    def calculation_barcode(self, tag: str = "barcode") -> None:
8845        """
8846        The `calculation_barcode` function calculates barcode values for variants in a VCF file and
8847        updates the INFO field in the file with the calculated barcode values.
8848
8849        :param tag: The `tag` parameter in the `calculation_barcode` function is used to specify the tag
8850        name that will be used for the barcode calculation in the VCF file. If no tag name is provided,
8851        the default tag name is set to "barcode", defaults to barcode
8852        :type tag: str (optional)
8853        """
8854
8855        # if FORMAT and samples
8856        if (
8857            "FORMAT" in self.get_header_columns_as_list()
8858            and self.get_header_sample_list()
8859        ):
8860
8861            # barcode annotation field
8862            if not tag:
8863                tag = "barcode"
8864
8865            # VCF infos tags
8866            vcf_infos_tags = {
8867                tag: "barcode calculation (VaRank)",
8868            }
8869
8870            # Prefix
8871            prefix = self.get_explode_infos_prefix()
8872
8873            # Field
8874            barcode_infos = prefix + tag
8875
8876            # Variants table
8877            table_variants = self.get_table_variants()
8878
8879            # Header
8880            vcf_reader = self.get_header()
8881
8882            # Create variant id
8883            variant_id_column = self.get_variant_id_column()
8884            added_columns = [variant_id_column]
8885
8886            # variant_id, FORMAT and samples
8887            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
8888                self.get_header_sample_list()
8889            )
8890
8891            # Create dataframe
8892            dataframe_barcode = self.get_query_to_df(
8893                f""" SELECT {samples_fields} FROM {table_variants} """
8894            )
8895
8896            # Create barcode column
8897            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
8898                lambda row: barcode(row, samples=self.get_header_sample_list()), axis=1
8899            )
8900
8901            # Add barcode to header
8902            vcf_reader.infos[tag] = vcf.parser._Info(
8903                tag,
8904                ".",
8905                "String",
8906                vcf_infos_tags.get(tag, vcf_infos_tags.get(tag)),
8907                "howard calculation",
8908                "0",
8909                self.code_type_map.get("String"),
8910            )
8911
8912            # Update
8913            sql_update = f"""
8914                UPDATE {table_variants}
8915                SET "INFO" = 
8916                    concat(
8917                        CASE
8918                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
8919                            THEN ''
8920                            ELSE concat("INFO", ';')
8921                        END,
8922                        CASE
8923                            WHEN dataframe_barcode."{barcode_infos}" NOT IN ('','.')
8924                            AND dataframe_barcode."{barcode_infos}" NOT NULL
8925                            THEN concat(
8926                                    '{tag}=',
8927                                    dataframe_barcode."{barcode_infos}"
8928                                )
8929                            ELSE ''
8930                        END
8931                    )
8932                FROM dataframe_barcode
8933                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
8934            """
8935            self.conn.execute(sql_update)
8936
8937            # Remove added columns
8938            for added_column in added_columns:
8939                self.drop_column(column=added_column)
8940
8941            # Delete dataframe
8942            del dataframe_barcode
8943            gc.collect()

The calculation_barcode function calculates barcode values for variants in a VCF file and updates the INFO field in the file with the calculated barcode values.

Parameters
  • tag: The tag parameter in the calculation_barcode function is used to specify the tag name that will be used for the barcode calculation in the VCF file. If no tag name is provided, the default tag name is set to "barcode", defaults to barcode
def calculation_barcode_family(self, tag: str = 'BCF') -> None:
    def calculation_barcode_family(self, tag: str = "BCF") -> None:
        """
        Calculate a family barcode for variants and write it into the sample
        genotype columns (two new FORMAT fields: the barcode itself and the
        list of family samples).

        The family is taken from the `family_pedigree` entry of the
        calculation parameters (file path, JSON string, comma-separated sample
        list, or dict); when absent, all samples are used.

        :param tag: The `tag` parameter is used to specify the barcode FORMAT
        tag that will be added to the VCF file during the calculation process.
        If no value is provided for the `tag` parameter, the default value
        used is "BCF", defaults to BCF
        :type tag: str (optional)
        """

        # if FORMAT and samples
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # barcode annotation field (fall back to default when empty/None)
            if not tag:
                tag = "BCF"

            # VCF infos tags: one for the barcode, one for the sample list
            vcf_infos_tags = {
                tag: "barcode family calculation",
                f"{tag}S": "barcode family samples",
            }

            # Param
            param = self.get_param()
            log.debug(f"param={param}")

            # Prefix
            prefix = self.get_explode_infos_prefix()

            # PED param: pedigree definition from calculation parameters
            ped = (
                param.get("calculation", {})
                .get("calculations", {})
                .get("BARCODEFAMILY", {})
                .get("family_pedigree", None)
            )
            log.debug(f"ped={ped}")

            # Load PED: normalize the pedigree into a dict {member: sample}
            if ped:

                # Pedigree is a file: load it as JSON
                if isinstance(ped, str) and os.path.exists(full_path(ped)):
                    log.debug("Pedigree is file")
                    with open(full_path(ped)) as ped:
                        ped = json.load(ped)

                # Pedigree is a string: JSON first, else comma-separated samples
                elif isinstance(ped, str):
                    log.debug("Pedigree is str")
                    try:
                        ped = json.loads(ped)
                        log.debug("Pedigree is json str")
                    except ValueError as e:
                        # Not JSON: treat as comma-separated sample names,
                        # each sample mapping to itself
                        ped_samples = ped.split(",")
                        ped = {}
                        for ped_sample in ped_samples:
                            ped[ped_sample] = ped_sample

                # Pedigree is a dict: use as-is
                elif isinstance(ped, dict):
                    log.debug("Pedigree is dict")

                # Pedigree is not well formatted
                else:
                    msg_error = "Pedigree not well formatted"
                    log.error(msg_error)
                    raise ValueError(msg_error)

                # Construct list of pedigree sample names
                ped_samples = list(ped.values())

            else:
                # No pedigree provided: use every sample from the VCF header
                log.debug("Pedigree not defined. Take all samples")
                ped_samples = self.get_header_sample_list()
                ped = {}
                for ped_sample in ped_samples:
                    ped[ped_sample] = ped_sample

            # Check pedigree is non-empty
            if not ped or len(ped) == 0:
                msg_error = f"Error in pedigree: samples {ped_samples}"
                log.error(msg_error)
                raise ValueError(msg_error)

            # Log
            log.info(
                "Calculation 'BARCODEFAMILY' - Samples: "
                + ", ".join([f"{member}='{ped[member]}'" for member in ped])
            )
            log.debug(f"ped_samples={ped_samples}")

            # Fully-prefixed name of the computed dataframe column
            barcode_infos = prefix + tag

            # Variants table
            table_variants = self.get_table_variants()

            # Header
            vcf_reader = self.get_header()

            # Create variant id column (added for the join, dropped at the end)
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # variant_id, FORMAT and pedigree samples
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                ped_samples
            )

            # Create dataframe
            # NOTE: the SQL below refers to this dataframe by its variable name
            dataframe_barcode = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Create barcode column (row-wise computation over family samples)
            dataframe_barcode[barcode_infos] = dataframe_barcode.apply(
                lambda row: barcode(row, samples=ped_samples), axis=1
            )

            # Add barcode family FORMAT fields (barcode and sample list) to header
            vcf_reader.formats[tag] = vcf.parser._Format(
                id=tag,
                num=".",
                type="String",
                desc=vcf_infos_tags.get(tag, "barcode family calculation"),
                type_code=self.code_type_map.get("String"),
            )
            vcf_reader.formats[f"{tag}S"] = vcf.parser._Format(
                id=f"{tag}S",
                num=".",
                type="String",
                desc=vcf_infos_tags.get(f"{tag}S", "barcode family samples"),
                type_code=self.code_type_map.get("String"),
            )

            # Update: build one SET clause per column, appending the two new
            # FORMAT fields to every genotype column (and their names to FORMAT)
            sql_update_set = []
            for sample in self.get_header_sample_list() + ["FORMAT"]:
                if sample in ped_samples:
                    # Family member: append the computed barcode and sample list
                    value = f'dataframe_barcode."{barcode_infos}"'
                    value_samples = "'" + ",".join(ped_samples) + "'"
                elif sample == "FORMAT":
                    # FORMAT column: append the two new field names
                    value = f"'{tag}'"
                    value_samples = f"'{tag}S'"
                else:
                    # Sample outside the family: append missing values
                    value = "'.'"
                    value_samples = "'.'"
                # For a fully-missing genotype './.', first pad it to match the
                # FORMAT arity: stripping [a-zA-Z0-9\s] from FORMAT leaves only
                # the ':' separators, and each ':' then becomes ':.'
                format_regex = r"[a-zA-Z0-9\s]"
                sql_update_set.append(
                    f"""
                        "{sample}" = 
                        concat(
                            CASE
                                WHEN {table_variants}."{sample}" = './.'
                                THEN concat('./.',regexp_replace(regexp_replace({table_variants}.FORMAT, '{format_regex}', '', 'g'), ':', ':.', 'g'))
                                ELSE {table_variants}."{sample}"
                            END,
                            ':',
                            {value},
                            ':',
                            {value_samples}
                        )
                    """
                )

            # Apply all SET clauses in a single UPDATE, joined on the variant id
            sql_update_set_join = ", ".join(sql_update_set)
            sql_update = f"""
                UPDATE {table_variants}
                SET {sql_update_set_join}
                FROM dataframe_barcode
                WHERE {table_variants}."{variant_id_column}" = dataframe_barcode."{variant_id_column}"
            """
            self.conn.execute(sql_update)

            # Remove added columns
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Delete dataframe
            del dataframe_barcode
            gc.collect()

The calculation_barcode_family function calculates a family barcode for variants in a VCF file and updates the FORMAT and sample genotype fields with the calculated barcode values and the list of family samples.

Parameters
  • tag: The tag parameter in the calculation_barcode_family function is used to specify the barcode tag that will be added to the VCF file during the calculation process. If no value is provided for the tag parameter, the default value used is "BCF", defaults to BCF
def calculation_trio(self) -> None:
9135    def calculation_trio(self) -> None:
9136        """
9137        The `calculation_trio` function performs trio calculations on a VCF file by adding trio
9138        information to the INFO field of each variant.
9139        """
9140
9141        # if FORMAT and samples
9142        if (
9143            "FORMAT" in self.get_header_columns_as_list()
9144            and self.get_header_sample_list()
9145        ):
9146
9147            # trio annotation field
9148            trio_tag = "trio"
9149
9150            # VCF infos tags
9151            vcf_infos_tags = {
9152                "trio": "trio calculation",
9153            }
9154
9155            # Param
9156            param = self.get_param()
9157
9158            # Prefix
9159            prefix = self.get_explode_infos_prefix()
9160
9161            # Trio param
9162            trio_ped = (
9163                param.get("calculation", {})
9164                .get("calculations", {})
9165                .get("TRIO", {})
9166                .get("trio_pedigree", None)
9167            )
9168
9169            # Load trio
9170            if trio_ped:
9171
9172                # Trio pedigree is a file
9173                if isinstance(trio_ped, str) and os.path.exists(full_path(trio_ped)):
9174                    log.debug("TRIO pedigree is file")
9175                    with open(full_path(trio_ped)) as trio_ped:
9176                        trio_ped = json.load(trio_ped)
9177
9178                # Trio pedigree is a string
9179                elif isinstance(trio_ped, str):
9180                    log.debug("TRIO pedigree is str")
9181                    try:
9182                        trio_ped = json.loads(trio_ped)
9183                        log.debug("TRIO pedigree is json str")
9184                    except ValueError as e:
9185                        trio_samples = trio_ped.split(",")
9186                        if len(trio_samples) == 3:
9187                            trio_ped = {
9188                                "father": trio_samples[0],
9189                                "mother": trio_samples[1],
9190                                "child": trio_samples[2],
9191                            }
9192                            log.debug("TRIO pedigree is list str")
9193                        else:
9194                            msg_error = "TRIO pedigree not well formatted"
9195                            log.error(msg_error)
9196                            raise ValueError(msg_error)
9197
9198                # Trio pedigree is a dict
9199                elif isinstance(trio_ped, dict):
9200                    log.debug("TRIO pedigree is dict")
9201
9202                # Trio pedigree is not well formatted
9203                else:
9204                    msg_error = "TRIO pedigree not well formatted"
9205                    log.error(msg_error)
9206                    raise ValueError(msg_error)
9207
9208                # Construct trio list
9209                trio_samples = [
9210                    trio_ped.get("father", ""),
9211                    trio_ped.get("mother", ""),
9212                    trio_ped.get("child", ""),
9213                ]
9214
9215            else:
9216                log.debug("TRIO pedigree not defined. Take the first 3 samples")
9217                samples_list = self.get_header_sample_list()
9218                if len(samples_list) >= 3:
9219                    trio_samples = self.get_header_sample_list()[0:3]
9220                    trio_ped = {
9221                        "father": trio_samples[0],
9222                        "mother": trio_samples[1],
9223                        "child": trio_samples[2],
9224                    }
9225                else:
9226                    msg_error = f"Error in TRIO pedigree: only {len(samples_list)} samples {samples_list}"
9227                    log.error(msg_error)
9228                    raise ValueError(msg_error)
9229
9230            # Check trio pedigree
9231            if not trio_ped or len(trio_ped) != 3:
9232                msg_error = f"Error in TRIO pedigree: {trio_ped}"
9233                log.error(msg_error)
9234                raise ValueError(msg_error)
9235
9236            # Log
9237            log.info(
9238                f"Calculation 'TRIO' - Samples: "
9239                + ", ".join([f"{member}='{trio_ped[member]}'" for member in trio_ped])
9240            )
9241
9242            # Field
9243            trio_infos = prefix + trio_tag
9244
9245            # Variants table
9246            table_variants = self.get_table_variants()
9247
9248            # Header
9249            vcf_reader = self.get_header()
9250
9251            # Create variant id
9252            variant_id_column = self.get_variant_id_column()
9253            added_columns = [variant_id_column]
9254
9255            # variant_id, FORMAT and samples
9256            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9257                self.get_header_sample_list()
9258            )
9259
9260            # Create dataframe
9261            dataframe_trio = self.get_query_to_df(
9262                f""" SELECT {samples_fields} FROM {table_variants} """
9263            )
9264
9265            # Create trio column
9266            dataframe_trio[trio_infos] = dataframe_trio.apply(
9267                lambda row: trio(row, samples=trio_samples), axis=1
9268            )
9269
9270            # Add trio to header
9271            vcf_reader.infos[trio_tag] = vcf.parser._Info(
9272                trio_tag,
9273                ".",
9274                "String",
9275                vcf_infos_tags.get(trio_tag, "snpEff hgvs annotations"),
9276                "howard calculation",
9277                "0",
9278                self.code_type_map.get("String"),
9279            )
9280
9281            # Update
9282            sql_update = f"""
9283                UPDATE {table_variants}
9284                SET "INFO" = 
9285                    concat(
9286                        CASE
9287                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9288                            THEN ''
9289                            ELSE concat("INFO", ';')
9290                        END,
9291                        CASE
9292                            WHEN dataframe_trio."{trio_infos}" NOT IN ('','.')
9293                             AND dataframe_trio."{trio_infos}" NOT NULL
9294                            THEN concat(
9295                                    '{trio_tag}=',
9296                                    dataframe_trio."{trio_infos}"
9297                                )
9298                            ELSE ''
9299                        END
9300                    )
9301                FROM dataframe_trio
9302                WHERE {table_variants}."{variant_id_column}" = dataframe_trio."{variant_id_column}"
9303            """
9304            self.conn.execute(sql_update)
9305
9306            # Remove added columns
9307            for added_column in added_columns:
9308                self.drop_column(column=added_column)
9309
9310            # Delete dataframe
9311            del dataframe_trio
9312            gc.collect()

The calculation_trio function performs trio calculations on a VCF file by adding trio information to the INFO field of each variant.

def calculation_vaf_normalization(self) -> None:
9314    def calculation_vaf_normalization(self) -> None:
9315        """
9316        The `calculation_vaf_normalization` function calculates the VAF (Variant Allele Frequency)
9317        normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.
9318        :return: The function does not return anything.
9319        """
9320
9321        # if FORMAT and samples
9322        if (
9323            "FORMAT" in self.get_header_columns_as_list()
9324            and self.get_header_sample_list()
9325        ):
9326
9327            # vaf_normalization annotation field
9328            vaf_normalization_tag = "VAF"
9329
9330            # VCF infos tags
9331            vcf_infos_tags = {
9332                "VAF": "VAF Variant Frequency",
9333            }
9334
9335            # Prefix
9336            prefix = self.get_explode_infos_prefix()
9337
9338            # Variants table
9339            table_variants = self.get_table_variants()
9340
9341            # Header
9342            vcf_reader = self.get_header()
9343
9344            # Do not calculate if VAF already exists
9345            if "VAF" in vcf_reader.formats:
9346                log.debug("VAF already on genotypes")
9347                return
9348
9349            # Create variant id
9350            variant_id_column = self.get_variant_id_column()
9351            added_columns = [variant_id_column]
9352
9353            # variant_id, FORMAT and samples
9354            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
9355                f""" "{sample}" """ for sample in self.get_header_sample_list()
9356            )
9357
9358            # Create dataframe
9359            query = f""" SELECT {variant_id_column}, FORMAT, {samples_fields} FROM {table_variants} """
9360            log.debug(f"query={query}")
9361            dataframe_vaf_normalization = self.get_query_to_df(query=query)
9362
9363            vaf_normalization_set = []
9364
9365            # for each sample vaf_normalization
9366            for sample in self.get_header_sample_list():
9367                dataframe_vaf_normalization[sample] = dataframe_vaf_normalization.apply(
9368                    lambda row: vaf_normalization(row, sample=sample), axis=1
9369                )
9370                vaf_normalization_set.append(
9371                    f""" "{sample}" = dataframe_vaf_normalization."{sample}" """
9372                )
9373
9374            # Add VAF to FORMAT
9375            dataframe_vaf_normalization["FORMAT"] = dataframe_vaf_normalization[
9376                "FORMAT"
9377            ].apply(lambda x: str(x) + ":VAF")
9378            vaf_normalization_set.append(
9379                f""" "FORMAT" = dataframe_vaf_normalization."FORMAT" """
9380            )
9381
9382            # Add vaf_normalization to header
9383            vcf_reader.formats[vaf_normalization_tag] = vcf.parser._Format(
9384                id=vaf_normalization_tag,
9385                num="1",
9386                type="Float",
9387                desc=vcf_infos_tags.get(vaf_normalization_tag, "VAF Variant Frequency"),
9388                type_code=self.code_type_map.get("Float"),
9389            )
9390
9391            # Create fields to add in INFO
9392            sql_vaf_normalization_set = " , ".join(vaf_normalization_set)
9393
9394            # Update
9395            sql_update = f"""
9396                UPDATE {table_variants}
9397                SET {sql_vaf_normalization_set}
9398                FROM dataframe_vaf_normalization
9399                WHERE variants."{variant_id_column}" = dataframe_vaf_normalization."{variant_id_column}"
9400
9401            """
9402            self.conn.execute(sql_update)
9403
9404            # Remove added columns
9405            for added_column in added_columns:
9406                self.drop_column(column=added_column)
9407
9408            # Delete dataframe
9409            del dataframe_vaf_normalization
9410            gc.collect()

The calculation_vaf_normalization function calculates the VAF (Variant Allele Frequency) normalization for each sample in a VCF file and updates the FORMAT and INFO fields accordingly.

Returns

The function does not return anything.

def calculation_genotype_stats(self, info: str = 'VAF') -> None:
    def calculation_genotype_stats(self, info: str = "VAF") -> None:
        """
        Compute per-variant genotype statistics for a sample-level information
        field and append them to the INFO column of the variants table.

        For each variant, the values of `info` across all samples are summarised
        by `genotype_stats` into several statistics (number, list, min, max,
        mean, mediane, stdev); each statistic is written as its own INFO tag
        (e.g. "VAF_stats_mean") and declared in the VCF header.

        :param info: The `info` parameter is a string that represents the type
        of information for which genotype statistics are calculated. It is used
        as the prefix of the generated INFO tags, defaults to VAF
        :type info: str (optional)
        """

        # Statistics only make sense when the VCF has a FORMAT column and at
        # least one sample
        if (
            "FORMAT" in self.get_header_columns_as_list()
            and self.get_header_sample_list()
        ):

            # Base tag name for all generated statistics (e.g. "VAF_stats")
            vaf_stats_tag = info + "_stats"

            # INFO tag -> header description, one entry per statistic.
            # NOTE(review): "mediane" spelling is kept as-is because it is part
            # of the emitted INFO tag name (changing it would change output).
            vcf_infos_tags = {
                info + "_stats_nb": f"genotype {info} Statistics - number of {info}",
                info + "_stats_list": f"genotype {info} Statistics - list of {info}",
                info + "_stats_min": f"genotype {info} Statistics - min {info}",
                info + "_stats_max": f"genotype {info} Statistics - max {info}",
                info + "_stats_mean": f"genotype {info} Statistics - mean {info}",
                info + "_stats_mediane": f"genotype {info} Statistics - mediane {info}",
                info
                + "_stats_stdev": f"genotype {info} Statistics - standard deviation {info}",
            }

            # Prefix used for exploded INFO columns
            prefix = self.get_explode_infos_prefix()

            # Name of the temporary dataframe column holding the stats dict
            vaf_stats_infos = prefix + vaf_stats_tag

            # Variants table
            table_variants = self.get_table_variants()

            # VCF header object, updated in place with the new INFO tags
            vcf_reader = self.get_header()

            # Variant id column, used to join the dataframe back onto the
            # variants table; dropped again at the end
            variant_id_column = self.get_variant_id_column()
            added_columns = [variant_id_column]

            # Columns to fetch: variant id, FORMAT and every sample column
            samples_fields = f" {variant_id_column}, FORMAT , " + " , ".join(
                self.get_header_sample_list()
            )

            # Load the genotype columns into a pandas dataframe
            dataframe_vaf_stats = self.get_query_to_df(
                f""" SELECT {samples_fields} FROM {table_variants} """
            )

            # Compute the statistics dict row by row (genotype_stats returns a
            # dict keyed by the tags declared in vcf_infos_tags)
            dataframe_vaf_stats[vaf_stats_infos] = dataframe_vaf_stats.apply(
                lambda row: genotype_stats(
                    row, samples=self.get_header_sample_list(), info=info
                ),
                axis=1,
            )

            # SQL fragments, one per statistic
            sql_vaf_stats_fields = []

            # Build one dataframe column and one SQL fragment per statistic
            for stat in vcf_infos_tags:

                # Extract this statistic from the stats dict ('' when missing)
                dataframe_vaf_stats[stat] = dataframe_vaf_stats[vaf_stats_infos].apply(
                    lambda x: dict(x).get(stat, "")
                )

                # Declare the statistic tag in the VCF header
                vcf_reader.infos[stat] = vcf.parser._Info(
                    stat,
                    ".",
                    "String",
                    vcf_infos_tags.get(stat, "genotype statistics"),
                    "howard calculation",
                    "0",
                    self.code_type_map.get("String"),
                )

                # First fragment gets no leading ';' separator, later ones do.
                # NOTE(review): if a non-first stat is present while an earlier
                # one is NULL, a doubled ';' could appear in INFO — presumably
                # genotype_stats always returns every key; confirm.
                if len(sql_vaf_stats_fields):
                    sep = ";"
                else:
                    sep = ""

                # Create fields to add in INFO
                sql_vaf_stats_fields.append(
                    f"""
                        CASE
                            WHEN dataframe_vaf_stats."{stat}" NOT NULL
                            THEN concat(
                                    '{sep}{stat}=',
                                    dataframe_vaf_stats."{stat}"
                                )
                            ELSE ''
                        END
                    """
                )

            # All fragments become arguments of a single SQL concat()
            sql_vaf_stats_fields_set = ",  ".join(sql_vaf_stats_fields)

            # Append the statistics to INFO, joining the dataframe by variant
            # id. NOTE: duckdb resolves "dataframe_vaf_stats" inside the SQL by
            # the local Python variable name (replacement scan) — do not rename
            # the variable.
            sql_update = f"""
                UPDATE {table_variants}
                SET "INFO" = 
                    concat(
                        CASE
                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
                            THEN ''
                            ELSE concat("INFO", ';')
                        END,
                        {sql_vaf_stats_fields_set}
                    )
                FROM dataframe_vaf_stats
                WHERE {table_variants}."{variant_id_column}" = dataframe_vaf_stats."{variant_id_column}"

            """
            self.conn.execute(sql_update)

            # Remove the temporary variant id column
            for added_column in added_columns:
                self.drop_column(column=added_column)

            # Release the dataframe memory
            del dataframe_vaf_stats
            gc.collect()

The calculation_genotype_stats function calculates genotype statistics for a given information field in a VCF file and updates the INFO column of the variants table with the calculated statistics.

Parameters
  • info: The info parameter is a string that represents the type of information for which genotype statistics are calculated. It is used to generate various VCF info tags for the statistics, such as the number of occurrences, the list of values, the minimum value, the maximum value, the mean, and the median. Defaults to "VAF".
def calculation_transcripts_annotation(self, info_json: str = None, info_format: str = None) -> None:
9550    def calculation_transcripts_annotation(
9551        self, info_json: str = None, info_format: str = None
9552    ) -> None:
9553        """
9554        The `calculation_transcripts_annotation` function creates a transcripts table and adds an info
9555        field to it if transcripts are available.
9556
9557        :param info_json: The `info_json` parameter in the `calculation_transcripts_annotation` method
9558        is a string parameter that represents the information field to be used in the transcripts JSON.
9559        It is used to specify the JSON format for the transcripts information. If no value is provided
9560        when calling the method, it defaults to "
9561        :type info_json: str
9562        :param info_format: The `info_format` parameter in the `calculation_transcripts_annotation`
9563        method is a string parameter that specifies the format of the information field to be used in
9564        the transcripts JSON. It is used to define the format of the information field
9565        :type info_format: str
9566        """
9567
9568        # Create transcripts table
9569        transcripts_table = self.create_transcript_view()
9570
9571        # Add info field
9572        if transcripts_table:
9573            self.transcript_view_to_variants(
9574                transcripts_table=transcripts_table,
9575                transcripts_info_field_json=info_json,
9576                transcripts_info_field_format=info_format,
9577            )
9578        else:
9579            log.info("No Transcripts to process. Check param.json file configuration")

The calculation_transcripts_annotation function creates a transcripts table and adds an info field to it if transcripts are available.

Parameters
  • info_json: The info_json parameter in the calculation_transcripts_annotation method is a string parameter that represents the information field to be used in the transcripts JSON. It is used to specify the JSON format for the transcripts information. If no value is provided when calling the method, it defaults to None.
  • info_format: The info_format parameter in the calculation_transcripts_annotation method is a string parameter that specifies the format of the information field to be used in the transcripts JSON. It is used to define the format of the information field
def calculation_transcripts_prioritization(self) -> None:
9581    def calculation_transcripts_prioritization(self) -> None:
9582        """
9583        The function `calculation_transcripts_prioritization` creates a transcripts table and
9584        prioritizes transcripts based on certain criteria.
9585        """
9586
9587        # Create transcripts table
9588        transcripts_table = self.create_transcript_view()
9589
9590        # Add info field
9591        if transcripts_table:
9592            self.transcripts_prioritization(transcripts_table=transcripts_table)
9593        else:
9594            log.info("No Transcripts to process. Check param.json file configuration")

The function calculation_transcripts_prioritization creates a transcripts table and prioritizes transcripts based on certain criteria.

def transcripts_prioritization(self, transcripts_table: str = None, param: dict = {}) -> bool:
9600    def transcripts_prioritization(
9601        self, transcripts_table: str = None, param: dict = {}
9602    ) -> bool:
9603        """
9604        The `transcripts_prioritization` function prioritizes transcripts based on certain parameters
9605        and updates the variants table with the prioritized information.
9606
9607        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name
9608        of the table containing transcripts data. If no value is provided, it defaults to "transcripts".
9609        This parameter is used to identify the table where the transcripts data is stored for the
9610        prioritization process
9611        :type transcripts_table: str
9612        :param param: The `param` parameter in the `transcripts_prioritization` method is a dictionary
9613        that contains various configuration settings for the prioritization process of transcripts. It
9614        is used to customize the behavior of the prioritization algorithm and includes settings such as
9615        the prefix for prioritization fields, default profiles, and other
9616        :type param: dict
9617        :return: The function `transcripts_prioritization` returns a boolean value `True` if the
9618        transcripts prioritization process is successfully completed, and `False` if there are any
9619        issues or if no profile is defined for transcripts prioritization.
9620        """
9621
9622        log.debug("Start transcripts prioritization...")
9623
9624        # Param
9625        if not param:
9626            param = self.get_param()
9627
9628        # Variants table
9629        table_variants = self.get_table_variants()
9630        log.debug(f"transcripts_table={transcripts_table}")
9631        # Transcripts table
9632        if transcripts_table is None:
9633            log.debug(f"transcripts_table={transcripts_table}")
9634            transcripts_table = self.create_transcript_view(
9635                transcripts_table="transcripts", param=param
9636            )
9637            log.debug(f"transcripts_table={transcripts_table}")
9638        if transcripts_table is None:
9639            msg_err = "No Transcripts table availalble"
9640            log.error(msg_err)
9641            raise ValueError(msg_err)
9642
9643        # Get transcripts columns
9644        columns_as_list_query = f"""
9645            DESCRIBE {transcripts_table}
9646        """
9647        columns_as_list = list(
9648            self.get_query_to_df(columns_as_list_query)["column_name"]
9649        )
9650
9651        # Create INFO if not exists
9652        if "INFO" not in columns_as_list:
9653            query_add_info = f"""
9654                ALTER TABLE {transcripts_table} ADD COLUMN INFO STRING DEFAULT '';
9655            """
9656            self.execute_query(query_add_info)
9657
9658        # Prioritization param and Force only PZ Score and Flag
9659        pz_param = param.get("transcripts", {}).get("prioritization", {})
9660        pz_fields_score = pz_param.get("pzprefix", "PTZ") + "Score"
9661        pz_fields_flag = pz_param.get("pzprefix", "PTZ") + "Flag"
9662        pz_fields_transcripts = pz_param.get("pzprefix", "PTZ") + "Transcript"
9663        pz_param["pzfields"] = [pz_fields_score, pz_fields_flag]
9664        pz_profile_default = (
9665            param.get("transcripts", {}).get("prioritization", {}).get("profiles", None)
9666        )
9667
9668        # Exit if no profile
9669        if pz_profile_default is None:
9670            log.warning("No profile defined for transcripts prioritization")
9671            return False
9672
9673        # Prioritization
9674        prioritization_result = self.prioritization(
9675            table=transcripts_table,
9676            pz_param=param.get("transcripts", {}).get("prioritization", {}),
9677        )
9678        if not prioritization_result:
9679            log.warning("Transcripts prioritization not processed")
9680            return False
9681
9682        # Explode PZ fields
9683        self.explode_infos(
9684            table=transcripts_table,
9685            fields=param.get("transcripts", {})
9686            .get("prioritization", {})
9687            .get("pzfields", []),
9688        )
9689
9690        # Export Transcripts prioritization infos to variants table
9691        query_update = f"""
9692            WITH RankedTranscripts AS (
9693                SELECT
9694                    "#CHROM", POS, REF, ALT, transcript, {pz_fields_score}, {pz_fields_flag},
9695                    ROW_NUMBER() OVER (
9696                        PARTITION BY "#CHROM", POS, REF, ALT
9697                        ORDER BY {pz_fields_flag} ASC, {pz_fields_score} DESC, transcript ASC
9698                    ) AS rn
9699                FROM
9700                    {transcripts_table}
9701            )
9702            UPDATE {table_variants}
9703                SET
9704                INFO = CONCAT(CASE
9705                            WHEN "INFO" IS NULL OR "INFO" IN ('','.')
9706                            THEN ''
9707                            ELSE concat("INFO", ';')
9708                        END,
9709                        concat('{pz_fields_transcripts}=', transcript, ';{pz_fields_score}=', {pz_fields_score}, ';{pz_fields_flag}=', {pz_fields_flag})
9710                        )
9711            FROM
9712                RankedTranscripts
9713            WHERE
9714                rn = 1
9715                AND variants."#CHROM" = RankedTranscripts."#CHROM"
9716                AND variants."POS" = RankedTranscripts."POS"
9717                AND variants."REF" = RankedTranscripts."REF"
9718                AND variants."ALT" = RankedTranscripts."ALT"
9719                
9720        """
9721        self.execute_query(query=query_update)
9722
9723        # Add PZ Transcript in header
9724        self.get_header().infos[pz_fields_transcripts] = vcf.parser._Info(
9725            pz_fields_transcripts,
9726            ".",
9727            "String",
9728            f"Transcript selected from transcripts prioritization process, profile {pz_profile_default}",
9729            "unknown",
9730            "unknown",
9731            code_type_map["String"],
9732        )
9733
9734        # Return
9735        return True

The transcripts_prioritization function prioritizes transcripts based on certain parameters and updates the variants table with the prioritized information.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing transcripts data. If no value is provided, it defaults to "transcripts". This parameter is used to identify the table where the transcripts data is stored for the prioritization process
  • param: The param parameter in the transcripts_prioritization method is a dictionary that contains various configuration settings for the prioritization process of transcripts. It is used to customize the behavior of the prioritization algorithm and includes settings such as the prefix for prioritization fields, default profiles, and other
Returns

The function transcripts_prioritization returns a boolean value True if the transcripts prioritization process is successfully completed, and False if there are any issues or if no profile is defined for transcripts prioritization.

def create_transcript_view_from_columns_map( self, transcripts_table: str = 'transcripts', columns_maps: dict = {}, added_columns: list = [], temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9737    def create_transcript_view_from_columns_map(
9738        self,
9739        transcripts_table: str = "transcripts",
9740        columns_maps: dict = {},
9741        added_columns: list = [],
9742        temporary_tables: list = None,
9743        annotation_fields: list = None,
9744    ) -> tuple[list, list, list]:
9745        """
9746        The `create_transcript_view_from_columns_map` function generates a temporary table view based on
9747        specified columns mapping for transcripts data.
9748
9749        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9750        the table where the transcripts data is stored or will be stored in the database. This table
9751        typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores,
9752        predictions, etc. It defaults to "transcripts, defaults to transcripts
9753        :type transcripts_table: str (optional)
9754        :param columns_maps: The `columns_maps` parameter is a dictionary that contains information about
9755        how to map columns from a transcripts table to create a view. Each entry in the `columns_maps` list
9756        represents a mapping configuration for a specific set of columns. It typically includes details such
9757        as the main transcript column and additional information columns
9758        :type columns_maps: dict
9759        :param added_columns: The `added_columns` parameter in the `create_transcript_view_from_columns_map`
9760        function is a list that stores the additional columns that will be added to the view being created
9761        based on the columns map provided. These columns are generated by exploding the transcript
9762        information columns along with the main transcript column
9763        :type added_columns: list
9764        :param temporary_tables: The `temporary_tables` parameter in the
9765        `create_transcript_view_from_columns_map` function is a list that stores the names of temporary
9766        tables created during the process of creating a transcript view from a columns map. These temporary
9767        tables are used to store intermediate results or transformations before the final view is generated
9768        :type temporary_tables: list
9769        :param annotation_fields: The `annotation_fields` parameter in the
9770        `create_transcript_view_from_columns_map` function is a list that stores the fields that are used
9771        for annotation in the query view creation process. These fields are extracted from the
9772        `transcripts_column` and `transcripts_infos_columns` specified in the `columns
9773        :type annotation_fields: list
9774        :return: The function `create_transcript_view_from_columns_map` returns a tuple containing three
9775        lists: `added_columns`, `temporary_tables`, and `annotation_fields`.
9776        """
9777
9778        log.debug("Start transcrpts view creation from columns map...")
9779
9780        # "from_columns_map": [
9781        #     {
9782        #         "transcripts_column": "Ensembl_transcriptid",
9783        #         "transcripts_infos_columns": [
9784        #             "genename",
9785        #             "Ensembl_geneid",
9786        #             "LIST_S2_score",
9787        #             "LIST_S2_pred",
9788        #         ],
9789        #     },
9790        #     {
9791        #         "transcripts_column": "Ensembl_transcriptid",
9792        #         "transcripts_infos_columns": [
9793        #             "genename",
9794        #             "VARITY_R_score",
9795        #             "Aloft_pred",
9796        #         ],
9797        #     },
9798        # ],
9799
9800        # Init
9801        if temporary_tables is None:
9802            temporary_tables = []
9803        if annotation_fields is None:
9804            annotation_fields = []
9805
9806        # Variants table
9807        table_variants = self.get_table_variants()
9808
9809        for columns_map in columns_maps:
9810
9811            # Transcript column
9812            transcripts_column = columns_map.get("transcripts_column", None)
9813
9814            # Transcripts infos columns
9815            transcripts_infos_columns = columns_map.get("transcripts_infos_columns", [])
9816
9817            if transcripts_column is not None:
9818
9819                # Explode
9820                added_columns += self.explode_infos(
9821                    fields=[transcripts_column] + transcripts_infos_columns
9822                )
9823
9824                # View clauses
9825                clause_select = []
9826                for field in [transcripts_column] + transcripts_infos_columns:
9827                    clause_select.append(
9828                        f""" regexp_split_to_table("{field}", ',') AS '{field}' """
9829                    )
9830                    if field not in [transcripts_column]:
9831                        annotation_fields.append(field)
9832
9833                # Querey View
9834                query = f""" 
9835                    SELECT
9836                        "#CHROM", POS, REF, ALT, INFO,
9837                        "{transcripts_column}" AS 'transcript',
9838                        {", ".join(clause_select)}
9839                    FROM (
9840                        SELECT 
9841                            "#CHROM", POS, REF, ALT, INFO,
9842                            {", ".join(clause_select)}
9843                        FROM {table_variants}
9844                        )
9845                    WHERE "{transcripts_column}" IS NOT NULL
9846                """
9847
9848                # Create temporary table
9849                temporary_table = transcripts_table + "".join(
9850                    random.choices(string.ascii_uppercase + string.digits, k=10)
9851                )
9852
9853                # Temporary_tables
9854                temporary_tables.append(temporary_table)
9855                query_view = f"""
9856                    CREATE TEMPORARY TABLE {temporary_table}
9857                    AS ({query})
9858                """
9859                self.execute_query(query=query_view)
9860
9861        return added_columns, temporary_tables, annotation_fields

The create_transcript_view_from_columns_map function generates a temporary table view based on specified columns mapping for transcripts data.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table where the transcripts data is stored or will be stored in the database. This table typically contains information about transcripts such as Ensembl transcript IDs, gene names, scores, predictions, etc. It defaults to "transcripts".
  • columns_maps: The columns_maps parameter is a dictionary that contains information about how to map columns from a transcripts table to create a view. Each entry in the columns_maps list represents a mapping configuration for a specific set of columns. It typically includes details such as the main transcript column and additional information columns
  • added_columns: The added_columns parameter in the create_transcript_view_from_columns_map function is a list that stores the additional columns that will be added to the view being created based on the columns map provided. These columns are generated by exploding the transcript information columns along with the main transcript column
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_columns_map function is a list that stores the names of temporary tables created during the process of creating a transcript view from a columns map. These temporary tables are used to store intermediate results or transformations before the final view is generated
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_columns_map function is a list that stores the fields that are used for annotation in the query view creation process. These fields are extracted from the transcripts_column and transcripts_infos_columns entries specified in the columns_maps parameter.
Returns

The function create_transcript_view_from_columns_map returns a tuple containing three lists: added_columns, temporary_tables, and annotation_fields.

def create_transcript_view_from_column_format( self, transcripts_table: str = 'transcripts', column_formats: dict = {}, temporary_tables: list = None, annotation_fields: list = None) -> tuple[list, list, list]:
9863    def create_transcript_view_from_column_format(
9864        self,
9865        transcripts_table: str = "transcripts",
9866        column_formats: dict = {},
9867        temporary_tables: list = None,
9868        annotation_fields: list = None,
9869    ) -> tuple[list, list, list]:
9870        """
9871        The `create_transcript_view_from_column_format` function generates a transcript view based on
9872        specified column formats, adds additional columns and annotation fields, and returns the list of
9873        temporary tables and annotation fields.
9874
9875        :param transcripts_table: The `transcripts_table` parameter is a string that specifies the name of
9876        the table containing the transcripts data. This table will be used as the base table for creating
9877        the transcript view. The default value for this parameter is "transcripts", but you can provide a
9878        different table name if needed, defaults to transcripts
9879        :type transcripts_table: str (optional)
9880        :param column_formats: The `column_formats` parameter is a dictionary that contains information
9881        about the columns to be used for creating the transcript view. Each entry in the dictionary
9882        specifies the mapping between a transcripts column and a transcripts infos column. For example, in
9883        the provided code snippet:
9884        :type column_formats: dict
9885        :param temporary_tables: The `temporary_tables` parameter in the
9886        `create_transcript_view_from_column_format` function is a list that stores the names of temporary
9887        views created during the process of creating a transcript view from a column format. These temporary
9888        views are used to manipulate and extract data before generating the final transcript view. It
9889        :type temporary_tables: list
9890        :param annotation_fields: The `annotation_fields` parameter in the
9891        `create_transcript_view_from_column_format` function is a list that stores the annotation fields
9892        that are extracted from the temporary views created during the process. These annotation fields are
9893        obtained by querying the temporary views and extracting the column names excluding specific columns
9894        like `#CH
9895        :type annotation_fields: list
9896        :return: The `create_transcript_view_from_column_format` function returns two lists:
9897        `temporary_tables` and `annotation_fields`.
9898        """
9899
9900        log.debug("Start transcrpts view creation from column format...")
9901
9902        #  "from_column_format": [
9903        #     {
9904        #         "transcripts_column": "ANN",
9905        #         "transcripts_infos_column": "Feature_ID",
9906        #     }
9907        # ],
9908
9909        # Init
9910        if temporary_tables is None:
9911            temporary_tables = []
9912        if annotation_fields is None:
9913            annotation_fields = []
9914
9915        for column_format in column_formats:
9916
9917            # annotation field and transcript annotation field
9918            annotation_field = column_format.get("transcripts_column", "ANN")
9919            transcript_annotation = column_format.get(
9920                "transcripts_infos_column", "Feature_ID"
9921            )
9922
9923            # Temporary View name
9924            temporary_view_name = transcripts_table + "".join(
9925                random.choices(string.ascii_uppercase + string.digits, k=10)
9926            )
9927
9928            # Create temporary view name
9929            temporary_view_name = self.annotation_format_to_table(
9930                uniquify=True,
9931                annotation_field=annotation_field,
9932                view_name=temporary_view_name,
9933                annotation_id=transcript_annotation,
9934            )
9935
9936            # Annotation fields
9937            if temporary_view_name:
9938                query_annotation_fields = f"""
9939                    SELECT *
9940                    FROM (
9941                        DESCRIBE SELECT *
9942                        FROM {temporary_view_name}
9943                        )
9944                        WHERE column_name not in ('#CHROM', 'POS', 'REF', 'ALT')
9945                """
9946                df_annotation_fields = self.get_query_to_df(
9947                    query=query_annotation_fields
9948                )
9949
9950                # Add temporary view and annotation fields
9951                temporary_tables.append(temporary_view_name)
9952                annotation_fields += list(set(df_annotation_fields["column_name"]))
9953
9954        return temporary_tables, annotation_fields

The create_transcript_view_from_column_format function generates a transcript view based on specified column formats, adds additional columns and annotation fields, and returns the list of temporary tables and annotation fields.

Parameters
  • transcripts_table: The transcripts_table parameter is a string that specifies the name of the table containing the transcripts data. This table will be used as the base table for creating the transcript view. The default value for this parameter is "transcripts", but you can provide a different table name if needed, defaults to transcripts
  • column_formats: The column_formats parameter is a dictionary that contains information about the columns to be used for creating the transcript view. Each entry in the dictionary specifies the mapping between a transcripts column and a transcripts infos column. For example, in the provided code snippet:
  • temporary_tables: The temporary_tables parameter in the create_transcript_view_from_column_format function is a list that stores the names of temporary views created during the process of creating a transcript view from a column format. These temporary views are used to manipulate and extract data before generating the final transcript view. It
  • annotation_fields: The annotation_fields parameter in the create_transcript_view_from_column_format function is a list that stores the annotation fields that are extracted from the temporary views created during the process. These annotation fields are obtained by querying the temporary views and extracting the column names, excluding the variant key columns (#CHROM, POS, REF, ALT).
Returns

The create_transcript_view_from_column_format function returns two lists: temporary_tables and annotation_fields.

def create_transcript_view( self, transcripts_table: str = None, transcripts_table_drop: bool = True, param: dict = {}) -> str:
 9956    def create_transcript_view(
 9957        self,
 9958        transcripts_table: str = None,
 9959        transcripts_table_drop: bool = True,
 9960        param: dict = {},
 9961    ) -> str:
 9962        """
 9963        The `create_transcript_view` function generates a transcript view by processing data from a
 9964        specified table based on provided parameters and structural information.
 9965
 9966        :param transcripts_table: The `transcripts_table` parameter in the `create_transcript_view` function
 9967        is used to specify the name of the table that will store the final transcript view data. If a table
 9968        name is not provided, the function will create a new table to store the transcript view data, and by
 9969        default,, defaults to transcripts
 9970        :type transcripts_table: str (optional)
 9971        :param transcripts_table_drop: The `transcripts_table_drop` parameter in the
 9972        `create_transcript_view` function is a boolean parameter that determines whether to drop the
 9973        existing transcripts table before creating a new one. If `transcripts_table_drop` is set to `True`,
 9974        the function will drop the existing transcripts table if it exists, defaults to True
 9975        :type transcripts_table_drop: bool (optional)
 9976        :param param: The `param` parameter in the `create_transcript_view` function is a dictionary that
 9977        contains information needed to create a transcript view. It includes details such as the structure
 9978        of the transcripts, columns mapping, column formats, and other necessary information for generating
 9979        the view. This parameter allows for flexibility and customization
 9980        :type param: dict
 9981        :return: The `create_transcript_view` function returns the name of the transcripts table that was
 9982        created or modified during the execution of the function.
 9983        """
 9984
 9985        log.debug("Start transcripts view creation...")
 9986
 9987        # Default
 9988        transcripts_table_default = "transcripts"
 9989
 9990        # Param
 9991        if not param:
 9992            param = self.get_param()
 9993
 9994        # Struct
 9995        struct = param.get("transcripts", {}).get("struct", None)
 9996
 9997        if struct:
 9998
 9999            # Transcripts table
10000            if transcripts_table is None:
10001                transcripts_table = param.get("transcripts", {}).get(
10002                    "table", transcripts_table_default
10003                )
10004
10005            # added_columns
10006            added_columns = []
10007
10008            # Temporary tables
10009            temporary_tables = []
10010
10011            # Annotation fields
10012            annotation_fields = []
10013
10014            # from columns map
10015            columns_maps = struct.get("from_columns_map", [])
10016            added_columns_tmp, temporary_tables_tmp, annotation_fields_tmp = (
10017                self.create_transcript_view_from_columns_map(
10018                    transcripts_table=transcripts_table,
10019                    columns_maps=columns_maps,
10020                    added_columns=added_columns,
10021                    temporary_tables=temporary_tables,
10022                    annotation_fields=annotation_fields,
10023                )
10024            )
10025            added_columns += added_columns_tmp
10026            temporary_tables += temporary_tables_tmp
10027            annotation_fields += annotation_fields_tmp
10028
10029            # from column format
10030            column_formats = struct.get("from_column_format", [])
10031            temporary_tables_tmp, annotation_fields_tmp = (
10032                self.create_transcript_view_from_column_format(
10033                    transcripts_table=transcripts_table,
10034                    column_formats=column_formats,
10035                    temporary_tables=temporary_tables,
10036                    annotation_fields=annotation_fields,
10037                )
10038            )
10039            temporary_tables += temporary_tables_tmp
10040            annotation_fields += annotation_fields_tmp
10041
10042            # Merge temporary tables query
10043            query_merge = ""
10044            for temporary_table in temporary_tables:
10045
10046                # First temporary table
10047                if not query_merge:
10048                    query_merge = f"""
10049                        SELECT * FROM {temporary_table}
10050                    """
10051                # other temporary table (using UNION)
10052                else:
10053                    query_merge += f"""
10054                        UNION BY NAME SELECT * FROM {temporary_table}
10055                    """
10056
10057            # Merge on transcript
10058            query_merge_on_transcripts_annotation_fields = []
10059            # Aggregate all annotations fields
10060            for annotation_field in set(annotation_fields):
10061                query_merge_on_transcripts_annotation_fields.append(
10062                    f""" list_aggregate(list_distinct(array_agg({annotation_field})), 'string_agg', ',') AS {annotation_field} """
10063                )
10064            # Query for transcripts view
10065            query_merge_on_transcripts = f"""
10066                SELECT "#CHROM", POS, REF, ALT, INFO, transcript, {", ".join(query_merge_on_transcripts_annotation_fields)}
10067                FROM ({query_merge})
10068                GROUP BY "#CHROM", POS, REF, ALT, INFO, transcript
10069            """
10070
10071            # Drop transcript view is necessary
10072            if transcripts_table_drop:
10073                query_drop = f"""
10074                    DROP TABLE IF EXISTS {transcripts_table};
10075                """
10076                self.execute_query(query=query_drop)
10077
10078            # Merge and create transcript view
10079            query_create_view = f"""
10080                CREATE TABLE IF NOT EXISTS {transcripts_table}
10081                AS {query_merge_on_transcripts}
10082            """
10083            self.execute_query(query=query_create_view)
10084
10085            # Remove added columns
10086            for added_column in added_columns:
10087                self.drop_column(column=added_column)
10088
10089        else:
10090
10091            transcripts_table = None
10092
10093        return transcripts_table

The create_transcript_view function generates a transcript view by processing data from a specified table based on provided parameters and structural information.

Parameters
  • transcripts_table: The transcripts_table parameter in the create_transcript_view function is used to specify the name of the table that will store the final transcript view data. If a table name is not provided, the function will create a new table to store the transcript view data, defaults to transcripts
  • transcripts_table_drop: The transcripts_table_drop parameter in the create_transcript_view function is a boolean parameter that determines whether to drop the existing transcripts table before creating a new one. If transcripts_table_drop is set to True, the function will drop the existing transcripts table if it exists, defaults to True
  • param: The param parameter in the create_transcript_view function is a dictionary that contains information needed to create a transcript view. It includes details such as the structure of the transcripts, columns mapping, column formats, and other necessary information for generating the view. This parameter allows for flexibility and customization
Returns

The create_transcript_view function returns the name of the transcripts table that was created or modified during the execution of the function.

def annotation_format_to_table( self, uniquify: bool = True, annotation_field: str = 'ANN', annotation_id: str = 'Feature_ID', view_name: str = 'transcripts') -> str:
10095    def annotation_format_to_table(
10096        self,
10097        uniquify: bool = True,
10098        annotation_field: str = "ANN",
10099        annotation_id: str = "Feature_ID",
10100        view_name: str = "transcripts",
10101    ) -> str:
10102        """
10103        The function `annotation_format_to_table` converts annotation data from a VCF file into a structured
10104        table format.
10105
10106        :param uniquify: The `uniquify` parameter is a boolean flag that determines whether to ensure unique
10107        values in the output or not. If set to `True`, the function will make sure that the output values
10108        are unique, defaults to True
10109        :type uniquify: bool (optional)
10110        :param annotation_field: The `annotation_field` parameter refers to the field in the VCF file that
10111        contains the annotation information for each variant. This field is used to extract the annotation
10112        details for further processing in the function, defaults to ANN
10113        :type annotation_field: str (optional)
10114        :param annotation_id: The `annotation_id` parameter in the `annotation_format_to_table` method is
10115        used to specify the identifier for the annotation feature. This identifier will be used as a column
10116        name in the resulting table or view that is created based on the annotation data. It helps in
10117        uniquely identifying each annotation entry in the, defaults to Feature_ID
10118        :type annotation_id: str (optional)
10119        :param view_name: The `view_name` parameter in the `annotation_format_to_table` method is used to
10120        specify the name of the temporary table that will be created to store the transformed annotation
10121        data. This table will hold the extracted information from the annotation field in a structured
10122        format for further processing or analysis, defaults to transcripts
10123        :type view_name: str (optional)
10124        :return: The function `annotation_format_to_table` is returning the name of the view created, which
10125        is stored in the variable `view_name`.
10126        """
10127
10128        # Annotation field
10129        annotation_format = "annotation_explode"
10130
10131        # Transcript annotation
10132        annotation_id = "".join(char for char in annotation_id if char.isalnum())
10133
10134        # Prefix
10135        prefix = self.get_explode_infos_prefix()
10136        if prefix:
10137            prefix = "INFO/"
10138
10139        # Annotation fields
10140        annotation_infos = prefix + annotation_field
10141        annotation_format_infos = prefix + annotation_format
10142
10143        # Variants table
10144        table_variants = self.get_table_variants()
10145
10146        # Header
10147        vcf_reader = self.get_header()
10148
10149        # Add columns
10150        added_columns = []
10151
10152        # Explode HGVS field in column
10153        added_columns += self.explode_infos(fields=[annotation_field])
10154
10155        if annotation_field in vcf_reader.infos:
10156
10157            # Extract ANN header
10158            ann_description = vcf_reader.infos[annotation_field].desc
10159            pattern = r"'(.+?)'"
10160            match = re.search(pattern, ann_description)
10161            if match:
10162                ann_header_match = match.group(1).split(" | ")
10163                ann_header = []
10164                ann_header_desc = {}
10165                for i in range(len(ann_header_match)):
10166                    ann_header_info = "".join(
10167                        char for char in ann_header_match[i] if char.isalnum()
10168                    )
10169                    ann_header.append(ann_header_info)
10170                    ann_header_desc[ann_header_info] = ann_header_match[i]
10171                if not ann_header_desc:
10172                    raise ValueError("Invalid header description format")
10173            else:
10174                raise ValueError("Invalid header description format")
10175
10176            # Create variant id
10177            variant_id_column = self.get_variant_id_column()
10178            added_columns += [variant_id_column]
10179
10180            # Create dataframe
10181            dataframe_annotation_format = self.get_query_to_df(
10182                f""" SELECT "#CHROM", POS, REF, ALT, INFO, "{variant_id_column}", "{annotation_infos}" FROM {table_variants} """
10183            )
10184
10185            # Create annotation columns
10186            dataframe_annotation_format[
10187                annotation_format_infos
10188            ] = dataframe_annotation_format[annotation_infos].apply(
10189                lambda x: explode_annotation_format(
10190                    annotation=str(x),
10191                    uniquify=uniquify,
10192                    output_format="JSON",
10193                    prefix="",
10194                    header=list(ann_header_desc.values()),
10195                )
10196            )
10197
10198            # Find keys
10199            query_json = f"""SELECT distinct(unnest(json_keys({annotation_format}, '$.0'))) AS 'key' FROM dataframe_annotation_format;"""
10200            df_keys = self.get_query_to_df(query=query_json)
10201
10202            # Check keys
10203            query_json_key = []
10204            for _, row in df_keys.iterrows():
10205
10206                # Key
10207                key = row.iloc[0]
10208
10209                # key_clean
10210                key_clean = "".join(char for char in key if char.isalnum())
10211
10212                # Type
10213                query_json_type = f"""SELECT unnest(json_extract_string({annotation_format}, '$.*."{key}"')) AS '{key_clean}' FROM dataframe_annotation_format WHERE trim('{key}') NOT IN ('');"""
10214
10215                # Get DataFrame from query
10216                df_json_type = self.get_query_to_df(query=query_json_type)
10217
10218                # Fill missing values with empty strings and then replace empty strings or None with NaN and drop rows with NaN
10219                with pd.option_context("future.no_silent_downcasting", True):
10220                    df_json_type.fillna(value="", inplace=True)
10221                    replace_dict = {None: np.nan, "": np.nan}
10222                    df_json_type.replace(replace_dict, inplace=True)
10223                    df_json_type.dropna(inplace=True)
10224
10225                # Detect column type
10226                column_type = detect_column_type(df_json_type[key_clean])
10227
10228                # Append
10229                query_json_key.append(
10230                    f"""NULLIF(unnest(json_extract_string({annotation_format}, '$.*."{key}"')), '')::{column_type}  AS '{prefix}{key_clean}' """
10231                )
10232
10233            # Create view
10234            query_view = f"""
10235                CREATE TEMPORARY TABLE {view_name}
10236                AS (
10237                    SELECT *, {annotation_id} AS 'transcript'
10238                    FROM (
10239                        SELECT "#CHROM", POS, REF, ALT, INFO, {",".join(query_json_key)}
10240                        FROM dataframe_annotation_format
10241                        )
10242                    );
10243            """
10244            self.execute_query(query=query_view)
10245
10246        else:
10247
10248            # Return None
10249            view_name = None
10250
10251        # Remove added columns
10252        for added_column in added_columns:
10253            self.drop_column(column=added_column)
10254
10255        return view_name

The function annotation_format_to_table converts annotation data from a VCF file into a structured table format.

Parameters
  • uniquify: The uniquify parameter is a boolean flag that determines whether to ensure unique values in the output or not. If set to True, the function will make sure that the output values are unique, defaults to True
  • annotation_field: The annotation_field parameter refers to the field in the VCF file that contains the annotation information for each variant. This field is used to extract the annotation details for further processing in the function, defaults to ANN
  • annotation_id: The annotation_id parameter in the annotation_format_to_table method is used to specify the identifier for the annotation feature. This identifier will be used as a column name in the resulting table or view that is created based on the annotation data. It helps in uniquely identifying each annotation entry, defaults to Feature_ID
  • view_name: The view_name parameter in the annotation_format_to_table method is used to specify the name of the temporary table that will be created to store the transformed annotation data. This table will hold the extracted information from the annotation field in a structured format for further processing or analysis, defaults to transcripts
Returns

The function annotation_format_to_table is returning the name of the view created, which is stored in the variable view_name.

def transcript_view_to_variants( self, transcripts_table: str = None, transcripts_column_id: str = None, transcripts_info_json: str = None, transcripts_info_field_json: str = None, transcripts_info_format: str = None, transcripts_info_field_format: str = None, param: dict = {}) -> bool:
10257    def transcript_view_to_variants(
10258        self,
10259        transcripts_table: str = None,
10260        transcripts_column_id: str = None,
10261        transcripts_info_json: str = None,
10262        transcripts_info_field_json: str = None,
10263        transcripts_info_format: str = None,
10264        transcripts_info_field_format: str = None,
10265        param: dict = {},
10266    ) -> bool:
10267        """
10268        The `transcript_view_to_variants` function updates a variants table with information from
10269        transcripts in JSON format.
10270
10271        :param transcripts_table: The `transcripts_table` parameter is used to specify the name of the
10272        table containing the transcripts data. If this parameter is not provided, the function will
10273        attempt to retrieve it from the `param` dictionary or use a default value of "transcripts"
10274        :type transcripts_table: str
10275        :param transcripts_column_id: The `transcripts_column_id` parameter is used to specify the
10276        column in the `transcripts_table` that contains the unique identifier for each transcript. This
10277        identifier is used to match transcripts with variants in the database
10278        :type transcripts_column_id: str
10279        :param transcripts_info_json: The `transcripts_info_json` parameter is used to specify the name
10280        of the column in the variants table where the transcripts information will be stored in JSON
10281        format. This parameter allows you to define the column in the variants table that will hold the
10282        JSON-formatted information about transcripts
10283        :type transcripts_info_json: str
10284        :param transcripts_info_field_json: The `transcripts_info_field_json` parameter is used to
10285        specify the field in the VCF header that will contain information about transcripts in JSON
10286        format. This field will be added to the VCF header as an INFO field with the specified name
10287        :type transcripts_info_field_json: str
10288        :param transcripts_info_format: The `transcripts_info_format` parameter is used to specify the
10289        format of the information about transcripts that will be stored in the variants table. This
10290        format can be used to define how the transcript information will be structured or displayed
10291        within the variants table
10292        :type transcripts_info_format: str
10293        :param transcripts_info_field_format: The `transcripts_info_field_format` parameter is used to
10294        specify the field in the VCF header that will contain information about transcripts in a
10295        specific format. This field will be added to the VCF header as an INFO field with the specified
10296        name
10297        :type transcripts_info_field_format: str
10298        :param param: The `param` parameter in the `transcript_view_to_variants` method is a dictionary
10299        that contains various configuration settings related to transcripts. It is used to provide
10300        default values for certain parameters if they are not explicitly provided when calling the
10301        method. The `param` dictionary can be passed as an argument
10302        :type param: dict
10303        :return: The function `transcript_view_to_variants` returns a boolean value. It returns `True`
10304        if the operation is successful and `False` if certain conditions are not met.
10305        """
10306
10307        msg_info_prefix = "Start transcripts view to variants annotations"
10308
10309        log.debug(f"{msg_info_prefix}...")
10310
10311        # Default
10312        transcripts_table_default = "transcripts"
10313        transcripts_column_id_default = "transcript"
10314        transcripts_info_json_default = None
10315        transcripts_info_format_default = None
10316        transcripts_info_field_json_default = None
10317        transcripts_info_field_format_default = None
10318
10319        # Param
10320        if not param:
10321            param = self.get_param()
10322
10323        # Transcripts table
10324        if transcripts_table is None:
10325            transcripts_table = param.get("transcripts", {}).get(
10326                "table", transcripts_table_default
10327            )
10328
10329        # Transcripts column ID
10330        if transcripts_column_id is None:
10331            transcripts_column_id = param.get("transcripts", {}).get(
10332                "column_id", transcripts_column_id_default
10333            )
10334
10335        # Transcripts info json
10336        if transcripts_info_json is None:
10337            transcripts_info_json = param.get("transcripts", {}).get(
10338                "transcripts_info_json", transcripts_info_json_default
10339            )
10340
10341        # Transcripts info field JSON
10342        if transcripts_info_field_json is None:
10343            transcripts_info_field_json = param.get("transcripts", {}).get(
10344                "transcripts_info_field_json", transcripts_info_field_json_default
10345            )
10346        # if transcripts_info_field_json is not None and transcripts_info_json is None:
10347        #     transcripts_info_json = transcripts_info_field_json
10348
10349        # Transcripts info format
10350        if transcripts_info_format is None:
10351            transcripts_info_format = param.get("transcripts", {}).get(
10352                "transcripts_info_format", transcripts_info_format_default
10353            )
10354
10355        # Transcripts info field FORMAT
10356        if transcripts_info_field_format is None:
10357            transcripts_info_field_format = param.get("transcripts", {}).get(
10358                "transcripts_info_field_format", transcripts_info_field_format_default
10359            )
10360        # if (
10361        #     transcripts_info_field_format is not None
10362        #     and transcripts_info_format is None
10363        # ):
10364        #     transcripts_info_format = transcripts_info_field_format
10365
10366        # Variants table
10367        table_variants = self.get_table_variants()
10368
10369        # Check info columns param
10370        if (
10371            transcripts_info_json is None
10372            and transcripts_info_field_json is None
10373            and transcripts_info_format is None
10374            and transcripts_info_field_format is None
10375        ):
10376            return False
10377
10378        # Transcripts infos columns
10379        query_transcripts_infos_columns = f"""
10380            SELECT *
10381            FROM (
10382                DESCRIBE SELECT * FROM {transcripts_table}
10383                )
10384            WHERE "column_name" NOT IN ('#CHROM', 'POS', 'REF', 'ALT', '{transcripts_column_id}')
10385        """
10386        transcripts_infos_columns = list(
10387            self.get_query_to_df(query=query_transcripts_infos_columns)["column_name"]
10388        )
10389
10390        # View results
10391        clause_select = []
10392        clause_to_json = []
10393        clause_to_format = []
10394        for field in transcripts_infos_columns:
10395            clause_select.append(
10396                f""" regexp_split_to_table("{field}", ',') AS '{field}' """
10397            )
10398            clause_to_json.append(f""" '{field}': "{field}" """)
10399            clause_to_format.append(f""" "{field}" """)
10400
10401        # Update
10402        update_set_json = []
10403        update_set_format = []
10404
10405        # VCF header
10406        vcf_reader = self.get_header()
10407
10408        # Transcripts to info column in JSON
10409        if transcripts_info_json is not None:
10410
10411            # Create column on variants table
10412            self.add_column(
10413                table_name=table_variants,
10414                column_name=transcripts_info_json,
10415                column_type="JSON",
10416                default_value=None,
10417                drop=False,
10418            )
10419
10420            # Add header
10421            vcf_reader.infos[transcripts_info_json] = vcf.parser._Info(
10422                transcripts_info_json,
10423                ".",
10424                "String",
10425                "Transcripts in JSON format",
10426                "unknwon",
10427                "unknwon",
10428                self.code_type_map["String"],
10429            )
10430
10431            # Add to update
10432            update_set_json.append(
10433                f""" {transcripts_info_json}=t.{transcripts_info_json} """
10434            )
10435
10436        # Transcripts to info field in JSON
10437        if transcripts_info_field_json is not None:
10438
10439            log.debug(f"{msg_info_prefix} - Annotation in JSON format...")
10440
10441            # Add to update
10442            update_set_json.append(
10443                f""" 
10444                    INFO = concat(
10445                            CASE
10446                                WHEN INFO NOT IN ('', '.')
10447                                THEN INFO
10448                                ELSE ''
10449                            END,
10450                            CASE
10451                                WHEN CAST(t.{transcripts_info_json} AS VARCHAR) NOT IN ('', '.')
10452                                THEN concat(
10453                                    ';{transcripts_info_field_json}=',
10454                                    t.{transcripts_info_json}
10455                                )
10456                                ELSE ''
10457                            END
10458                            )
10459                """
10460            )
10461
10462            # Add header
10463            vcf_reader.infos[transcripts_info_field_json] = vcf.parser._Info(
10464                transcripts_info_field_json,
10465                ".",
10466                "String",
10467                "Transcripts in JSON format",
10468                "unknwon",
10469                "unknwon",
10470                self.code_type_map["String"],
10471            )
10472
10473        if update_set_json:
10474
10475            # Update query
10476            query_update = f"""
10477                UPDATE {table_variants}
10478                    SET {", ".join(update_set_json)}
10479                FROM
10480                (
10481                    SELECT
10482                        "#CHROM", POS, REF, ALT,
10483                            concat(
10484                            '{{',
10485                            string_agg(
10486                                '"' || "{transcripts_column_id}" || '":' ||
10487                                to_json(json_output)
10488                            ),
10489                            '}}'
10490                            )::JSON AS {transcripts_info_json}
10491                    FROM
10492                        (
10493                        SELECT
10494                            "#CHROM", POS, REF, ALT,
10495                            "{transcripts_column_id}",
10496                            to_json(
10497                                {{{",".join(clause_to_json)}}}
10498                            )::JSON AS json_output
10499                        FROM
10500                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10501                        WHERE "{transcripts_column_id}" IS NOT NULL
10502                        )
10503                    GROUP BY "#CHROM", POS, REF, ALT
10504                ) AS t
10505                WHERE {table_variants}."#CHROM" = t."#CHROM"
10506                    AND {table_variants}."POS" = t."POS"
10507                    AND {table_variants}."REF" = t."REF"
10508                    AND {table_variants}."ALT" = t."ALT"
10509            """
10510
10511            self.execute_query(query=query_update)
10512
10513        # Transcripts to info column in FORMAT
10514        if transcripts_info_format is not None:
10515
10516            # Create column on variants table
10517            self.add_column(
10518                table_name=table_variants,
10519                column_name=transcripts_info_format,
10520                column_type="VARCHAR",
10521                default_value=None,
10522                drop=False,
10523            )
10524
10525            # Add header
10526            vcf_reader.infos[transcripts_info_format] = vcf.parser._Info(
10527                transcripts_info_format,
10528                ".",
10529                "String",
10530                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10531                "unknwon",
10532                "unknwon",
10533                self.code_type_map["String"],
10534            )
10535
10536            # Add to update
10537            update_set_format.append(
10538                f""" {transcripts_info_format}=t.{transcripts_info_format} """
10539            )
10540
10541        # Transcripts to info field in JSON
10542        if transcripts_info_field_format is not None:
10543
10544            log.debug(f"{msg_info_prefix} - Annotation in structured format...")
10545
10546            # Add to update
10547            update_set_format.append(
10548                f""" 
10549                    INFO = concat(
10550                            CASE
10551                                WHEN INFO NOT IN ('', '.')
10552                                THEN INFO
10553                                ELSE ''
10554                            END,
10555                            CASE
10556                                WHEN CAST(t.{transcripts_info_format} AS VARCHAR) NOT IN ('', '.')
10557                                THEN concat(
10558                                    ';{transcripts_info_field_format}=',
10559                                    t.{transcripts_info_format}
10560                                )
10561                                ELSE ''
10562                            END
10563                            )
10564                """
10565            )
10566
10567            # Add header
10568            vcf_reader.infos[transcripts_info_field_format] = vcf.parser._Info(
10569                transcripts_info_field_format,
10570                ".",
10571                "String",
10572                f"Transcripts annotations: 'transcript | {' | '.join(transcripts_infos_columns)}'",
10573                "unknwon",
10574                "unknwon",
10575                self.code_type_map["String"],
10576            )
10577
10578        if update_set_format:
10579
10580            # Update query
10581            query_update = f"""
10582                UPDATE {table_variants}
10583                    SET {", ".join(update_set_format)}
10584                FROM
10585                (
10586                    SELECT
10587                        "#CHROM", POS, REF, ALT,
10588                            string_agg({transcripts_info_format}) AS {transcripts_info_format}
10589                    FROM 
10590                        (
10591                        SELECT
10592                            "#CHROM", POS, REF, ALT,
10593                            "{transcripts_column_id}",
10594                            concat(
10595                                "{transcripts_column_id}",
10596                                '|',
10597                                {", '|', ".join(clause_to_format)}
10598                            ) AS {transcripts_info_format}
10599                        FROM
10600                            (SELECT "#CHROM", POS, REF, ALT, "{transcripts_column_id}", {", ".join(clause_select)} FROM {transcripts_table})
10601                        )
10602                    GROUP BY "#CHROM", POS, REF, ALT
10603                ) AS t
10604                WHERE {table_variants}."#CHROM" = t."#CHROM"
10605                    AND {table_variants}."POS" = t."POS"
10606                    AND {table_variants}."REF" = t."REF"
10607                    AND {table_variants}."ALT" = t."ALT"
10608            """
10609
10610            self.execute_query(query=query_update)
10611
10612        return True

The transcript_view_to_variants function updates a variants table with information from transcripts in JSON format.

Parameters
  • transcripts_table: The transcripts_table parameter is used to specify the name of the table containing the transcripts data. If this parameter is not provided, the function will attempt to retrieve it from the param dictionary or fall back to the default value "transcripts".
  • transcripts_column_id: The transcripts_column_id parameter is used to specify the column in the transcripts_table that contains the unique identifier for each transcript. This identifier is used to match transcripts with variants in the database
  • transcripts_info_json: The transcripts_info_json parameter is used to specify the name of the column in the variants table where the transcripts information will be stored in JSON format. This parameter allows you to define the column in the variants table that will hold the JSON-formatted information about transcripts
  • transcripts_info_field_json: The transcripts_info_field_json parameter is used to specify the field in the VCF header that will contain information about transcripts in JSON format. This field will be added to the VCF header as an INFO field with the specified name
  • transcripts_info_format: The transcripts_info_format parameter is used to specify the format of the information about transcripts that will be stored in the variants table. This format can be used to define how the transcript information will be structured or displayed within the variants table
  • transcripts_info_field_format: The transcripts_info_field_format parameter is used to specify the field in the VCF header that will contain information about transcripts in a specific format. This field will be added to the VCF header as an INFO field with the specified name
  • param: The param parameter in the transcript_view_to_variants method is a dictionary that contains configuration settings related to transcripts. It supplies default values for the other parameters when they are not explicitly provided in the call; passing a param dictionary as an argument overrides the instance-level configuration for this call.
Returns

The function transcript_view_to_variants returns a boolean value: True when the variants table was updated (or no update was required) successfully, and False when the required inputs (such as the transcripts table or column identifiers) could not be resolved.